/************************************************************************/
/*                                                                      */
/*    vspline - a set of generic tools for creation and evaluation      */
/*              of uniform b-splines                                    */
/*                                                                      */
/*            Copyright 2015 - 2017 by Kay F. Jahnke                    */
/*                                                                      */
/*    The git repository for this software is at                        */
/*                                                                      */
/*    https://bitbucket.org/kfj/vspline                                 */
/*                                                                      */
/*    Please direct questions, bug reports, and contributions to        */
/*                                                                      */
/*    kfjahnke+vspline@gmail.com                                        */
/*                                                                      */
/*    Permission is hereby granted, free of charge, to any person       */
/*    obtaining a copy of this software and associated documentation    */
/*    files (the "Software"), to deal in the Software without           */
/*    restriction, including without limitation the rights to use,      */
/*    copy, modify, merge, publish, distribute, sublicense, and/or      */
/*    sell copies of the Software, and to permit persons to whom the    */
/*    Software is furnished to do so, subject to the following          */
/*    conditions:                                                       */
/*                                                                      */
/*    The above copyright notice and this permission notice shall be    */
/*    included in all copies or substantial portions of the             */
/*    Software.                                                         */
/*                                                                      */
/*    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND    */
/*    EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES   */
/*    OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND          */
/*    NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT       */
/*    HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,      */
/*    WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING      */
/*    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR     */
/*    OTHER DEALINGS IN THE SOFTWARE.                                   */
/*                                                                      */
/************************************************************************/

/// \file transform.h
///
/// \brief set of generic remap, transform and apply functions
///
/// My foremost reason to have efficient B-spline processing is the formulation of
/// generic remap-like functions. remap() is a function which takes an array of real-valued
/// nD coordinates and an interpolator over a source array. Now each of the real-valued
/// coordinates is fed into the interpolator in turn, yielding a value, which is placed
/// in the output array at the same place the coordinate occupies in the coordinate
/// array. To put it concisely, if we have
///
/// - c, the coordinate array (or 'warp' array)
/// - a, the source array
/// - i, the interpolator over a
/// - j, a coordinate into c and t
/// - t, the target array
///
/// remap defines the operation
///
/// t[j] = i(c[j]) for all j
///
/// Now we widen the concept of remapping to a 'transform'
/// function. Instead of limiting the process to the use of an 'interpolator', we use
/// an arbitrary unary functor transforming incoming values to outgoing values, where
/// the type of the incoming and outgoing values is determined by the functor. If the
/// functor actually is an interpolator, we have a 'true' remap transforming coordinates
/// into values, but this is merely a special case. So here we have:
///
/// - c, an array containing input values
/// - f, a unary functor converting input to output values
/// - j, a coordinate into c and t
/// - t, the target array
///
/// transform performs the operation
///
/// t[j] = f(c[j]) for all j
///
/// remaps/transforms to other-dimensional objects are supported. This makes it possible to,
/// for example, remap from a volume to a 2D image, using a 2D coordinate array containing
/// 3D coordinates.
///
/// There is also a variant of this transform function in this file, which doesn't take an
/// input array. Instead, for every target location, the location's discrete coordinates
/// are passed to the unary_functor type object. This way, transformation-based remaps
/// can be implemented easily: the user code just has to provide a suitable functor
/// to yield values for coordinates. This functor will internally take the discrete
/// incoming coordinates (into the target array) and take it from there, eventually
/// producing values of the target array's value_type.
/// Here we have:
///
/// - f, a unary functor converting discrete coordinates to output values
/// - j, a discrete coordinate into t
/// - t, the target array
///
/// 'index-based' transform performs the operation
///
/// t[j] = f(j) for all j
///
/// This file also has code to evaluate a b-spline at positions in a mesh grid, which can
/// be used for scaling, and for separable geometric transformations.
///
/// Finally there is a function to restore the original data from a b-spline to the
/// precision possible with the given data type and degree of the spline. This is done
/// with a call to transform for 1D splines, and a grid_eval for higher dimensions.
///
/// The current implementation of the remap functionality uses a straightforward mode of
/// operation, which factors out the various needed tasks into separate bits of code. The
/// result data are acquired by 'pulling' them into the target array by repeatedly calling
/// a functor yielding the results. This functor is a closure containing all logic needed
/// to produce the result values in scan order of the target array. While remap, transform
/// and grid_eval should cover most use cases, it's quite possible to use the routine fill()
/// itself, passing in a suitable functor - but note it's in namespace vspline::detail.
///
/// While the code presented here is quite involved and there are several types and routines
/// the use(fulness) of which isn't immediately apparent, most use cases will be able to get
/// by using only remap() or transform(). Calling these functions is simplified by
/// the fact that their template arguments match function parameters. Hence remap and
/// transform can be called without specifying the template arguments.
///
/// Note: Currently, the calls to multithread() are hardwired to use partition_to_tiles()
/// as their partitioner. partition_to_tiles() falls back to partition_to_stripes() if
/// its 'own' partitioning scheme fails to produce the desired number of parts or if
/// the data are not 2D. This way, most use cases should receive adequate treatment.
///
/// Coding remap functions for vspline is an interesting problem, because of vspline's
/// scope. We want a solution which is dimension-agnostic, can handle all of vspline's
/// potential value types, multithreads, and vectorizes transparently for such types
/// which can be used with hardware vectorization, automatically falling back to
/// unvectorized code if the value_type in question can't be vectorized. On top of
/// that it should scale well and hide all this complexity in the implementation,
/// providing only a clean, simple interface without the scary detail.
///
/// It turns out that all these demands can be taken into account at the same time.
/// The current solution is reasonably complex, but 'does the trick'.

#ifndef VSPLINE_TRANSFORM_H
#define VSPLINE_TRANSFORM_H

#include "multithread.h"
#include "eval.h"

namespace vspline {

using namespace std ;
using namespace vigra ;

/// bcv_type is shorthand for a vigra::TinyVector holding 'dimension' values
/// of type bc_code (bc_code is declared elsewhere; presumably this is one
/// boundary condition code per axis - confirm against the declaration).
template < int dimension >
using bcv_type = vigra::TinyVector < bc_code , dimension > ;

// we start out with the workhorse code.
// The implementation of remap(), transform() etc. is after namespace detail.

namespace detail {

/// struct _fill contains the implementation of the 'engine' used for transform-like
/// functions. The design logic is this: a transform will ultimately produce an array
/// of results. This array is filled in standard scan order sequence by repeated
/// calls to a functor containing all the logic to produce values in the required
/// order. The functor is like a closure, being set up initially with all parameters
/// needed for the task at hand (like with a warp array, a transformation, a genuine
/// generator function etc.). Since the functor controls the path of the calculation
/// from whatever starting point to the production of the final result, there are no
/// intermediate containers for intermediate results. Since the remap process is
/// mainly memory-bound, this strategy helps keeping memory use low. The data can
/// be produced one by one, but the code has vectorized operation as well, which
/// brings noticeable performance gain. With vectorized operation, instead of producing
/// single values, the engine produces vectors of values. This operation is transparent
/// to the caller, since the data are picked up and deposited in normal interleaved
/// fashion. The extra effort for vectorized operation is in the implementation of the
/// generator functor and reasonably straightforward. If only the standard remap
/// functions are used, the user can remain ignorant of the vectorization.
///
/// struct _fill's operator() takes an object of class generator_type. This object
/// has to satisfy a few requirements:
///
/// - it has to have an overloaded operator() accepting two signatures: one taking
///   a pointer to vsize value_type, one taking a reference to a single value_type.
///   these arguments specify where to deposit the generator's output.
///
/// - it has to offer a bindOuter routine producing a subdimensional generator
///   to supply values for a slice of output
///
/// - it has to offer a subrange routine, limiting output to a subarray
///   of the 'whole' output

//  TODO might write an abstract base class specifying the interface

/// In the current implementation, the hierarchical descent to subdimensional slices
/// is always taken to the lowest level, leaving the actual calls of the functor to
/// occur there. While the hierarchical access may consume some processing time, mainly
/// to establish the bounds for the 1D operation - but possibly optimized away,
/// the operation on 1D data can use optimizations which gain more than is needed
/// for the hierarchical descent. Especially when vectorized code is used, operation
/// on 1D data is very efficient, since the data can be accessed using load/store
/// or gather/scatter operations, even when the arrays involved are strided.
/// Taking the hierarchical descent down to level 0 is encoded in fill() and its
/// workhorse code; the generator objects implemented here depend on the descent
/// going all the way down to 1D.
///
/// Note: support of hardware-assisted gather/scatter operations is reasonably new
/// in Vc. On my (AVX2) system, using hardware gather/scatter increases performance
/// for my typical applications in the order of magnitude of 10%, so if your code
/// is time-critical, make sure your Vc is up-to-date - best build from source.

// TODO while the current implementation has to issue load/store operations without
// passing Vc::Aligned, since there is no guarantee that individual lines of data
// are aligned, using special MultiArrays where the underlying memory's shape along
// dimension 0 is extended to coincide with a vector boundary would allow using
// aligned operation. But this would require further specialization, and/or looking
// at the array's strides. Alternatively, a method Vc uses (hinted at further down)
// processes single data until it hits an aligned memory location, from where on it
// processes vectorized with aligned operations. I followed this lead but could not
// produce a performance gain.

// Note the third template argument, _vsize. This only comes into play at level 0
// (see the specializations below) and is needed to differentiate between cases
// where the operation can be vectorized (because the underlying functor can do it)
// and situations where it can't be (like, when using functors with only single-value
// eval), in which case _vsize comes in as 1 and the code automatically falls back
// to unvectorized mode.

template < typename generator_type  , // functor object yielding values
           int dim_out ,              // number of dimensions of output array
           int _vsize = 0 >
struct _fill
{
  /// Fills 'output' by peeling off one slice after the other along the
  /// highest axis. Both the output array and the generator are reduced by
  /// one dimension with bindOuter, and the pair is handed to the _fill
  /// object for the next lower dimension. The actual calls to the generator
  /// happen in the 1D specializations, where the recursion ends.

  void operator() ( generator_type & gen ,
                    MultiArrayView < dim_out , typename generator_type::value_type >
                      & output )
  {
    // extent of the highest axis, which is the number of slices to process
    const auto slice_count = output.shape ( dim_out - 1 ) ;

    for ( int slice = 0 ; slice < slice_count ; ++slice )
    {
      // matching (dim_out - 1)-dimensional views of target and generator
      auto output_slice = output.bindOuter ( slice ) ;
      auto generator_slice = gen.bindOuter ( slice ) ;

      // the sliced generator has a type of its own, so the next level's
      // _fill has to be instantiated for that type. The vsize passed on
      // is the generator's, which only matters once we arrive at 1D.
      typedef decltype ( generator_slice ) slice_generator_type ;
      typedef _fill < slice_generator_type ,
                      dim_out - 1 ,
                      generator_type::vsize > slice_fill_type ;

      slice_fill_type() ( generator_slice , output_slice ) ;
    }
  }
} ;

// browsing Vc's code base, I noticed the undocumented functions
// simd_for_each and simd_for_each_n, which do simple iterations over
// contiguous single-channel memory - the code is in 'algorithms.h'.
// what's interesting there is that the code iterates with scalar
// values until it has reached an aligned address. then it continues
// by passing vectors to the unary functor as long as full vectors
// can be found, and finally the remaining values are also passed as
// scalars. The effect is that the central loop which is processing
// vectors will certainly load from an aligned address, and hence the
// load operation can be issued with Vc::Aligned set true.
// #defining RUNUP_TO_ALIGNED implements this behaviour, but in my
// tests, the resulting code was slower. I suspect this is due to
// the runup code making it harder for the optimizer. Another factor
// is my specific vector unit (AVX2) - AFAIK AVX2 handles unaligned
// access efficiently, while older vector units may perform badly
// with unaligned access. for these vector units, using RUNUP_TO_ALIGNED
// might produce performance gain TODO try

// #define RUNUP_TO_ALIGNED

/// specialization of _fill for 1D output, which ends the recursive descent.
/// Here, with template argument _vsize unfixed, we have the vector code,
/// below is a specialization for _vsize == 1 which is unvectorized.
///
/// operator() fills 'output' in scan order: as many full vectors as fit are
/// requested from the generator and written to memory with a SIMD store or
/// a scatter, then the remaining values are requested one at a time.

template < typename generator_type , int _vsize >
struct _fill < generator_type , 1 , _vsize >
{
  typedef typename generator_type::value_type value_type ;
    
  // get the functor's type and use it to fix a few types needed for
  // vectorized operation

  typedef typename generator_type::functor_type functor_type ;
  
  // dimension: number of ele_type making up one output value
  // vsize:     number of output values processed per vector
  // advance:   number of ele_type covered by one vector's worth of output
  //            values in unstrided memory

  enum { dimension = functor_type::dim_out } ;
  enum { vsize = functor_type::vsize } ;
  enum { advance = dimension * vsize } ;
  
  typedef typename functor_type::out_v out_v ;
  typedef typename functor_type::out_ele_type ele_type ;
  typedef typename functor_type::out_ele_v ele_v ;
  
  typedef typename vspline::vector_traits < ele_type , vsize > :: index_type index_type ;

  /// store a single vector to consecutive memory starting at dp. The aligned
  /// variant is only used if RUNUP_TO_ALIGNED is #defined, because only then
  /// operator() makes sure that dp is aligned before vector code starts.

  inline void store ( const ele_v & src ,
                      ele_type * dp )
  {
#ifdef RUNUP_TO_ALIGNED
    src.store ( dp , Vc::Aligned ) ;
#else
    src.store ( dp ) ;
#endif
  }

  // compiler needs this overload, but it is never called
  // TODO: avoid it altogether
  inline void store ( const TinyVector < ele_v , dimension > & src ,
                      ele_type * dp )
  {
    assert ( dimension == 1 ) ;
    src[0].store ( dp ) ;
  }
  
  /// scatter a single vector to the locations dp[indexes]

  inline void scatter ( const ele_v & src ,
                        ele_type * dp ,
                        const index_type & indexes )
  {
    src.scatter ( dp , indexes ) ;
  }

  /// scatter a TinyVector of vectors: channel e goes to the locations
  /// (dp + e)[indexes], which interleaves the channels in memory.

  inline void scatter ( const TinyVector < ele_v , dimension > & src ,
                        ele_type * dp ,
                        const index_type & indexes )
  {
    for ( int e = 0 ; e < dimension ; e++ )
      src[e].scatter ( dp + e , indexes ) ;
  }

  /// fill the 1D array 'output' with values obtained from 'gen'.
  /// gen is called once per full vector with an out_v buffer, and once
  /// per remaining single value with a reference to the target element.

  void operator() ( generator_type & gen ,
                    MultiArrayView < 1 , typename generator_type::value_type >
                      & output )
  {
    auto target_it = output.begin() ;

    // keep the element count in the type elementCount() returns instead of
    // narrowing it to int, which would truncate for very long lines.
    auto leftover = output.elementCount() ;

    ele_type * dp = (ele_type*) ( output.data() ) ;
    
#ifdef RUNUP_TO_ALIGNED

    // process single values until dp is aligned or the data are exhausted

    while ( leftover && ( ! vspline::is_aligned ( dp ) ) )
    {
      gen ( *target_it ) ;
      ++target_it ;
      --leftover ;
      // only look at the next element if there is one: after the last
      // value target_it is at the end and must not be dereferenced.
      if ( leftover )
        dp = (ele_type*) &(*target_it) ;
    }

#endif

    const auto aggregates = leftover / vsize ; // number of full vectors
    leftover -= aggregates * vsize ;           // remaining leftover single values

    out_v target_buffer ;
    
    if ( output.isUnstrided() )
    {
      if ( dimension == 1 )
      {
        // best case: unstrided operation on 1D data, we can use
        // efficient SIMD store operation        
        for ( decltype ( leftover ) a = 0 ; a < aggregates ; a++ , dp += advance )
        {
          gen ( target_buffer ) ;
          // and store it to destination with a SIMD store.
          store ( target_buffer , dp ) ;
        }
      }
      else
      {
        // second best: unstrided operation on nD data
        for ( decltype ( leftover ) a = 0 ; a < aggregates ; a++ , dp += advance )
        {
          gen ( target_buffer ) ;
          // and store it to destination with a scatter operation.
          scatter ( target_buffer , dp , index_type::IndexesFromZero() * dimension ) ;
        }
      }
    }
    else
    {
      // worst case: strided operation. here, instead of using 'advance'
      // directly (which is compile-time constant and therefore potentially
      // very good for the optimizer) we have to use a run-time value for
      // advancing dp.
      auto strided_advance = advance * output.stride(0) ;
      for ( decltype ( leftover ) a = 0 ; a < aggregates ; a++ , dp += strided_advance )
      {
        // here we generate to a simdized target type
        gen ( target_buffer ) ;
        // and store it to destination using a scatter operation.
        scatter ( target_buffer , dp ,
                  index_type::IndexesFromZero() * dimension * output.stride(0) ) ;
      }
    }        
    // if there aren't any leftovers, we can return straight away.
    if ( ! leftover )
      return ;

    // otherwise, advance target_it past the values written by vector code
    target_it += aggregates * vsize ;
    
    // process the remaining values with single-value evaluation
    while ( leftover-- )
    {
      gen ( *target_it ) ;
      ++target_it ;
    }
  }
} ;

/// unvectorized variant of 1D _fill object. This is very straightforward.

template < typename generator_type >
struct _fill < generator_type , 1 , 1 >
{
  typedef typename generator_type::value_type value_type ;

  void operator() ( generator_type & gen ,
                    MultiArrayView < 1 , typename generator_type::value_type >
                      & output )
  {
    auto target_it = output.begin() ;  
    auto target_end = output.end() ;  

    // process leftovers. If vc isn't used, this loop does all the processing
    while ( target_it != target_end )
    {
      // process leftovers with single-value evaluation
      gen ( *target_it ) ;
      ++target_it ;
    }
  }
} ;

/// single-threaded fill. st_fill processes one part of the target array:
/// 'range' holds the part's start and end, p_gen points to the generator for
/// the whole array and p_output to the whole target array. Both are narrowed
/// down to 'range', and the resulting pair is passed to _fill, which calls the
/// generator and deposits the results in the target.

template < typename generator_type  , // functor object yielding values
           int dim_out >              // number of dimensions of output array
void st_fill ( shape_range_type < dim_out > range ,
               generator_type * const       p_gen ,
               MultiArrayView < dim_out , typename generator_type::value_type > * p_output )
{
  typedef _fill < generator_type , dim_out , generator_type::vsize > fill_type ;

  // the part of the target array this call is responsible for

  auto target_part = p_output->subarray ( range[0] , range[1] ) ;

  // a generator covering the same part. This has to be an instance of its
  // own: generators carry state, so the one p_gen points to can't be shared
  // between threads. The local instance is only used by the thread running
  // this call, though _fill may slice it further.

  auto part_gen = p_gen->subrange ( range ) ;

  // produce the values and store them

  fill_type() ( part_gen , target_part ) ;
}

/// multithreaded fill. This is the top-level fill routine. It takes a functor capable
/// of delivering successive result values (in the target array's scan order), and calls
/// this functor repeatedly until 'output' is full.
/// this task is distributed to several worker threads by means of 'multithread', which in
/// turn uses st_fill, the single-threaded fill routine.

template < typename generator_type  , // functor object yielding values
           int dim_target >           // number of dimensions of output array
void fill ( generator_type & gen ,
            MultiArrayView < dim_target , typename generator_type::value_type >
              & output )
{
  // set up 'range' to cover a complete array of output's size
  
  shape_range_type < dim_target > range ( shape_type < dim_target > () ,
                                          output.shape() ) ;

  // heuristic. minumum desired number of partitions; partition_to_tiles
  // only uses this value when it delegates to partition_to_stripes.

  int njobs = vspline::common_thread_pool.get_nthreads() ;

  // call multithread(), specifying the single-threaded fill routine as the
  // functor to invoke the threads with, and the partitioner to use on 'range'.
  // next come desired number of partitions and the original, 'whole' range,
  // followed by the other parameters the single-threaded fill needs, which is
  // pretty much the set of parameters we've received here, with the difference
  // that we don't pass anything on by reference and use pointers instead.

  multithread ( & detail::st_fill < generator_type , dim_target > ,
                vspline::partition_to_tiles < dim_target > ,
                njobs ,        // desired number of partitions
                range ,        // 'full' range which is to be partitioned
                &gen ,         // generator_type object
                &output ) ;    // target array
} ;

/// Next we code 'generators' for use with fill(). These objects can yield values
/// to the fill routine, each in its specific way. The first type we define is
/// warp_generator. This generator yields data from an array, which, in the context
/// of a remap-like function, will provide the coordinates to feed to the interpolator.
/// Seen from the generalized context, it provides arguments to the functor to use
/// to produce result values, and might more aptly be called something like 'picker',
/// since it picks successive batches of input values from the input array.
///
/// First is warp_generator for dimensions > 1. Here we provide 'subrange' and
/// 'bindOuter' to be used for the hierarchical descent in _fill. The current
/// implementation relies on the hierarchical descent going all the way to 1D,
/// and does not implement operator() until the 1D specialization.
///
/// note the flag strided_warp. If the warp array is strided in dimension 0,
/// this flag has to be set true.

template < int dimension ,
           typename unary_functor_type ,
           bool strided_warp >
struct warp_generator
{
  typedef unary_functor_type functor_type ;

  // the functor fixes the type of the values in the warp array (its input),
  // the type of the values produced (its output) and the vector width

  typedef typename unary_functor_type::in_type nd_rc_type ;
  typedef typename unary_functor_type::out_type value_type ;
  enum { vsize = unary_functor_type::vsize } ;

  typedef MultiArrayView < dimension , nd_rc_type > warp_array_type ;

  // the view is held by value: subrange() and bindOuter() construct new
  // generators from temporary views, so a reference would be left dangling.

  const warp_array_type warp ;

  const unary_functor_type & itp ;

  warp_generator ( const warp_array_type & _warp ,
                   const unary_functor_type & _itp )
  : warp ( _warp ) ,
    itp ( _itp )
  { }

  /// access to the functor this generator feeds

  const unary_functor_type & get_functor()
  {
    return itp ;
  }

  /// yields a generator of the same type which is limited to the part of
  /// the warp array between range[0] and range[1]

  warp_generator subrange ( const shape_range_type < dimension > & range ) const
  {
    return warp_generator ( warp.subarray ( range[0] , range[1] ) , itp ) ;
  }

  /// yields a generator for slice c along the highest axis of the warp
  /// array, which has one dimension less than this generator

  warp_generator < dimension - 1 , unary_functor_type , strided_warp >
    bindOuter ( const int & c ) const
  {
    typedef warp_generator < dimension - 1 ,
                             unary_functor_type ,
                             strided_warp > slice_generator_type ;

    return slice_generator_type ( warp.bindOuter ( c ) , itp ) ;
  }
} ;

/// here we have the 1D specialization of warp_generator, where the actual
/// processing takes place. The generator walks through the 1D warp array
/// with the pointer dp: every call to operator() reads input at dp, passes
/// it to the functor's eval routine together with the target, and advances
/// dp to the input for the next call. Note that, in this specialization,
/// 'dimension' is the functor's dim_in, the number of ele_type making up one
/// input value - not the dimension of the warp array, which is 1.

template < typename unary_functor_type ,
           bool strided_warp >
struct warp_generator < 1 ,
                        unary_functor_type ,
                        strided_warp >
{
  typedef unary_functor_type functor_type ;
  
  typedef typename unary_functor_type::in_type nd_rc_type ;
  enum { dimension = unary_functor_type::dim_in } ;

  typedef typename unary_functor_type::out_type value_type ;
  enum { vsize = unary_functor_type::vsize } ;
  
  typedef MultiArrayView < 1 , nd_rc_type > warp_array_type ;
  
  const warp_array_type warp ; // must not use reference here!
  
  typedef typename unary_functor_type::in_ele_type ele_type ;

  // dp points to the input for the next call, seen as ele_type.
  // witer is set to the beginning of the warp array on construction; the
  // code in this struct does not use it any further.

  const ele_type * dp ;
  typename warp_array_type::const_iterator witer ;
  
  const unary_functor_type & itp ;
  
  /// access to the functor this generator feeds

  const unary_functor_type & get_functor()
  {
    return itp ;
  }
  
  // the initializers are listed in the order in which the members are
  // declared, which is the order in which they are executed.

  warp_generator
    ( const warp_array_type & _warp ,
      const unary_functor_type & _itp )
  : warp ( _warp ) ,
    dp ( (ele_type*) ( _warp.data() ) ) ,
    witer ( _warp.begin() ) ,
    itp ( _itp )
  {
  }

  /// If vectorization isn't used, this routine does all the work.
  /// This is the overload taking a straight value_type & as its
  /// argument. Below is code for vectorized operation.
  /// We dispatch on strided_warp:

  void operator() ( value_type & target )
  {
    operator() ( target ,
                 std::integral_constant < bool , strided_warp > () ) ;
  }

  /// unvectorized operator() for strided warp arrays. The step to the next
  /// input value is 'dimension' ele_type times the warp array's stride.
  
  void operator() ( value_type & target , std::true_type )
  {
    itp.eval ( *((nd_rc_type*)dp) , target ) ;
    dp += dimension * warp.stride(0) ;
  }

  /// unvectorized operator() for unstrided warp arrays. Input values follow
  /// each other directly, so the step is 'dimension' ele_type.
  
  void operator() ( value_type & target , std::false_type )
  {
    itp.eval ( *((nd_rc_type*)dp) , target ) ;
    dp += dimension ;
  }

#ifdef USE_VC

  // number of ele_type covered by one vector's worth of input values
  // in unstrided memory

  enum { advance = dimension * vsize } ;
  
  typedef typename vector_traits < ele_type , vsize > :: ele_v ele_v ;
  typedef typename vspline::vector_traits < ele_type , vsize > :: index_type index_type ;

  // gather indexes for one channel of unstrided data: 0, dimension,
  // 2 * dimension, ...

  const index_type indexes
  = vspline::vector_traits < ele_type , vsize > :: IndexesFromZero() * dimension ;
  
  typedef typename unary_functor_type::in_ele_v source_ele_type ;
  typedef vigra::TinyVector < source_ele_type , dimension > source_type ;
  
  // initially I implemented a single operator() with conditionals on
  // strided_warp and dimension, expecting that the compiler would
  // pick out the right code without performance impact, but this turned
  // out wrong. so now I'm using a dispatch mechanism which picks the
  // appropriate code, effectively forcing the compiler to do the right
  // thing. TODO: this teaches me a lesson. I think I have relied on
  // dead code elimination in several places, so I may have to go through
  // the inner loops looking for similar situations. The performance
  // difference was not large but consistently measurable.
  
  /// dispatch to the operator() variant for strided or unstrided warp.
  /// while the code for both variants is very similar, the differentiation
  /// is important, because the unstrided case can use advance (which is
  /// a compile-time constant) directly, while the second case has to
  /// multiply with the stride, which is a run-time value.
  /// we write this as a member function template, making it a worse match
  /// for operator() ( value_type & ) and so assuring that this overload
  /// will only match if T is *not* a straight value_type, in which case
  /// we can be assured that we're running vector code.
  
  template < class T >
  inline void operator() ( T & target )
  {
    static_assert ( vsize > 1 , "this code must not be called for vsize == 1" ) ;
    operator() ( target ,
                 std::integral_constant < bool , strided_warp > () ) ;
  }
  
  /// vectorized variant of operator() for strided warp arrays
  /// here we don't need to dispatch further, since the stride forces
  /// us to use gather operations even for 1D data.
  
  template < class T >
  inline void operator() ( T & target ,
                           std::true_type )       // strided warp array
  {
    source_type buffer ;
    
    for ( int e = 0 ; e < dimension ; e++ )
      buffer[e].gather
        ( dp + e , indexes * warp.stride(0) ) ; 

    itp.eval ( unwrap(buffer) , target ) ;
    dp += advance * warp.stride(0) ;
  }
  
  /// vectorized variant of operator() for unstrided warp arrays
  /// this variant of operator() further dispatches on 1D/nD data, which
  /// would be futile for strided data (which have to use gather anyway)
  /// but, with unstrided data, if the data are 1D, can result in a (fast)
  /// SIMD load operation. Otherwise it's gathers.
  
  template < class T >
  inline void operator() ( T & target ,
                           std::false_type )       // unstrided warp array
  {
    source_type buffer ;
    
    load ( buffer ,
           std::integral_constant < bool , dimension == 1 > () ) ;

    itp.eval ( unwrap(buffer) , target ) ;
    dp += advance ;
  }
  
  /// loading 1D data from unstrided memory can use SIMD load instruction:
  
  inline
  void load ( source_type & buffer ,
              std::true_type         // data are 1D, use SIMD load
            )
  {
    buffer[0].load ( dp ) ;
  }
  
  /// nD data have to be gathered instead, and buffer is indexable
  
  inline void
  load ( source_type & buffer ,
         std::false_type        // not 1D, use gather
       )
  {
    for ( int e = 0 ; e < dimension ; e++ )
      buffer[e].gather ( dp + e , indexes ) ;
  }

#endif

  /// subrange is used to create a warp_generator from part of the data
  /// while we are at the lowest level here, we still need the subrange routine
  /// for cases where the data are 1D in the first place: in this situation,
  /// we need to be able to split up the range as well.

  warp_generator < 1 ,
                   unary_functor_type ,
                   strided_warp >
    subrange ( const shape_range_type < 1 > & range ) const
  {
    return warp_generator < 1 ,
                            unary_functor_type ,
                            strided_warp >
             ( warp.subarray ( range[0] , range[1] ) , itp ) ;
  }

} ;

/// for transform() from indexes we need a different generator object: here we don't
/// pick input values at successive locations from an array, but instead pass the nD
/// indices which correspond to these locations - and are the same at which
/// output will be stored, as well. In fact it is feasible to implement
/// warp_generator using index_generator, by simply picking data from the input array
/// at the indexes index_generator produces. I tried that, but due to the index maths
/// needed, it came out slower than the implementation I give here.
///
/// class index_generator provides nD indices as input to its functor which coincide
/// with the location in the target array for which the functor is called. The data type
/// of these indices is derived from the functor's input type. Again we presume that
/// fill() will recurse to level 0, so index_generator's operator() will only be called
/// at the lowest level of recursion, and we needn't even define it for higher levels.

template < typename unary_functor_type ,
           int level >
struct index_generator
{
  typedef unary_functor_type functor_type ;
  
  typedef typename unary_functor_type::out_type value_type ;

  enum { dimension = unary_functor_type::dim_in } ;
  enum { vsize = unary_functor_type :: vsize } ;

  const unary_functor_type & itp ;
  const shape_range_type < dimension > range ;
  
  const unary_functor_type & get_functor()
  {
    return itp ;
  }
  
  index_generator
    ( const unary_functor_type & _itp ,
      const shape_range_type < dimension > _range )
  : itp ( _itp ) ,
    range ( _range )
  { } ;

  index_generator < unary_functor_type , level >
    subrange ( const shape_range_type < dimension > range ) const
  {
    return index_generator < unary_functor_type , level >
             ( itp , range ) ;
  }
  
  index_generator < unary_functor_type , level - 1 >
    bindOuter ( const int & c ) const
  {
    auto slice_start = range[0] , slice_end = range[1] ;

    slice_start [ level ] += c ;
    slice_end [ level ] = slice_start [ level ] + 1 ;
    
    return index_generator < unary_functor_type , level - 1 >
             ( itp , shape_range_type < dimension > ( slice_start , slice_end ) ) ;
  }  
} ;

/// specialization of index_generator for level 0. Here, the indices for all higher
/// dimensions have been fixed by the hierarchical descent, and we only need to concern
/// ourselves with the index(es) for dimension 0, and supply the operator() implementations.
/// Note how we derive the concrete type of index from the functor. This way, whatever
/// the functor takes is provided with no need of type conversion, which would be necessary
/// if we'd only produce integral indices here.

template < typename unary_functor_type >
struct index_generator < unary_functor_type , 0 >
{
  typedef unary_functor_type functor_type ;
  
  typedef typename unary_functor_type::in_ele_type index_ele_type ;
  typedef typename unary_functor_type::out_type value_type ;

  enum { dimension = unary_functor_type::dim_in } ;
  typedef vigra::TinyVector < index_ele_type , dimension > index_type ;
  
  enum { vsize = unary_functor_type::vsize } ;
  
#ifdef USE_VC

  typedef typename unary_functor_type::out_v out_v ;
  typedef typename unary_functor_type::in_ele_v index_ele_v ;

  typedef vigra::TinyVector < index_ele_v , dimension > index_v ;
  index_v current_v ; // current vectorized index to feed to functor

#endif
  
  index_type current ; // singular index

  const unary_functor_type & itp ;
  const shape_range_type < dimension > range ;
  
  /// access to the functor this generator was created with

  const unary_functor_type & get_functor()
  {
    return itp ;
  }
  
  /// the constructor stores the functor reference and the range and sets
  /// the index(es) to the beginning of the range.

  index_generator
    ( const unary_functor_type & _itp ,
      const shape_range_type < dimension > _range
    )
  : itp ( _itp ) ,
    range ( _range )
  {
    // initially, set the singular index to the beginning of the range
    current = index_type ( range[0] ) ;
    
#ifdef USE_VC

    // vectorized processing will be done only if vsize > 1.
    // vectorized processing will process the bulk of the data, leaving
    // only a few 'stragglers' to mop up afterwards. But if vsize == 1,
    // we're using the unvectorized code as fallback, in which case
    // all values are treated as stragglers ;)

    if ( vsize > 1 )
    {
      // initialize current_v to hold the first simdized index: every
      // component is a broadcast of the range's start, and component 0
      // additionally receives the lane numbers 0, 1, 2 ...
      for ( int d = 0 ; d < dimension ; d++ )
        current_v[d] = index_ele_v ( range[0][d] ) ;
      current_v[0] += vspline::vector_traits < index_ele_type , vsize > :: IndexesFromZero() ;
    }
    
#endif

  }
  
  /// single-value evaluation. This will be used for all values if vc isn't used,
  /// or only for mop-up action after all full vectors are processed. If operator()
  /// is called for straight value_type, this is the best matching overload.
  /// After the evaluation, the singular index is advanced by one along axis 0.

  void operator() ( value_type & target )
  {
    itp.eval ( unwrap ( current ) , target ) ;
    current[0] += index_ele_type ( 1 ) ;
  }

#ifdef USE_VC
 
  /// vectorized evaluation. Hierarchical descent has left us with only the
  /// level0 coordinate to increase, making this code very efficient.
  /// Here we have T as a template argument. This version will only match
  /// if T is not a straight value_type, because if it were, the first operator()
  /// variant would be preferred.
  /// Component 0 of the vectorized index is rebuilt from the singular index
  /// on every call, and the singular index is then advanced by vsize, so the
  /// singular index is in the right place when the mop-up code takes over.

  template < class T >
  void operator() ( T & target )
  {
    static_assert ( vsize > 1 , "this code must not be called for vsize == 1" ) ;
    current_v[0] = index_ele_v::IndexesFromZero() + index_ele_v ( current[0] ) ;
    itp.eval ( current_v , target ) ;
    current[0] += vsize ;
  }

#endif

  /// while we are at the lowest level here, we still need the subrange routine
  /// for cases where the data are 1D in the first place: in this situation,
  /// we need to be able to split up the range as well.

  index_generator < unary_functor_type , 0 >
    subrange ( const shape_range_type < dimension > range ) const
  {
    return index_generator < unary_functor_type , 0 >
             ( itp , range ) ;
  }
} ;

} ; // namespace detail

/// implementation of transform() by delegation to the more general fill() routine,
/// passing in the input array and the interpolator via a generator object.
/// This is a generalization of a remap routine: the remap concept looks at the incoming
/// data as coordinates, at the functor as an interpolator yielding values for coordinates,
/// and at the output as an array of thusly generated values.
/// Here, incoming and outgoing data aren't necessarily coordinates or the result of
/// an interpolation, they can be any pair of types which the functor can handle.
///
/// transform takes two template arguments:
///
/// - 'unary_functor_type', which is a class satisfying the interface laid down in
///   unary_functor.h. Typically, this would be a type inheriting from
///   vspline::unary_functor, but any type will do as long as it provides the required
///   typedefs and the relevant eval() routines.
///
/// - the type of the output array
///
/// this overload of transform takes three parameters:
///
/// - a reference to a const unary_functor_type object providing the functionality needed
///   to generate values from coordinates.
///
/// - a reference to a const MultiArrayView holding values to feed to the unary functor
///   object. It has to have the same shape as the target array and contain data of
///   the unary_functor's in_type.
///
/// - a reference to a MultiArrayView to use as a target. This is where the resulting
///   data are put, so it has to contain data of unary_functor's out_type. It has to have
///   the same shape as the input array.

template < typename unary_functor_type  , // functor yielding values for coordinates
           typename output_type >         // type of output array
void transform ( const unary_functor_type & ev ,
                 const MultiArrayView
                     < output_type::actual_dimension ,
                       typename unary_functor_type::in_type
                     > & input ,
                 output_type & output
               )
{
  // make sure the functor's output type matches the otput array's value_type

  static_assert ( std::is_same < typename unary_functor_type::out_type ,
                                 typename output_type::value_type > :: value ,
                  "functor's output type and output's value_type must match" ) ;

  // check shape compatibility
  
  if ( output.shape() != input.shape() )
  {
    throw shape_mismatch
     ( "transform: the shapes of the input array and the output array do not match" ) ;
  }

  enum { dim_target = output_type::actual_dimension } ;
  
  // we test if the input array is unstrided in dimension 0. If that is so, even
  // if it is strided in higher dimensions, via the hierarchical descent we will
  // eventually arrive in dimension 0 and iterate over an unstrided array.
  // This only matters if Vc is used, because if the input array is unstrided,
  // the coordinates can be loaded more effectively. Note that this method
  // requires that the hierarchical access goes down all the way to 1D.
  // this test determines the type of input generator we need. With this type
  // fixed, we proceed to set up the appropriate generator object and pass
  // it to fill, together with the output array to receive the results.

  if ( input.isUnstrided ( 0 ) )
  {
    typedef detail::warp_generator < dim_target ,
                                     unary_functor_type ,
                                     false                // unstrided
                                   > gen_t ;  
    gen_t gen ( input , ev ) ;  
    detail::fill < gen_t , dim_target > ( gen , output ) ;
  }
  else
  {
    // input array is strided even in dimension 0
    typedef detail::warp_generator < dim_target ,
                                     unary_functor_type ,
                                     true > gen_t ;       // strided
    gen_t gen ( input , ev ) ;  
    detail::fill < gen_t , dim_target > ( gen , output ) ;
  }
}

/// for backward compatibility, deprecated.
/// up to vspline 0.2.1, the function above was also named 'remap', but I decided to
/// rename it to 'transform', which names it more aptly.

template < typename unary_functor_type  , // functor yielding values for coordinates
           typename output_type >         // type of output array
void remap ( const unary_functor_type & ev ,
             const MultiArrayView
                     < output_type::actual_dimension ,
                       typename unary_functor_type::in_type
                     > & input ,
             output_type & output
           )
{
  // pure forwarder: all three arguments are handed on, unchanged,
  // to vspline::transform, which does the actual work
  vspline::transform ( ev , input , output ) ;
}

/// we code 'apply' as a special variant of transform where the output
/// is also used as input, so the effect is to feed the unary functor
/// each 'output' value in turn, let it process it and store the result
/// back to the same location.

template < typename unary_functor_type  , // functor yielding values for coordinates
           typename output_type >         // type of output array
void apply ( const unary_functor_type & ev ,
             output_type & output )
{
  // the three types which have to coincide for in-place operation

  typedef typename unary_functor_type::in_type functor_in_type ;
  typedef typename unary_functor_type::out_type functor_out_type ;
  typedef typename output_type::value_type stored_type ;

  // what the functor produces must be what the array stores

  static_assert ( std::is_same < functor_out_type , stored_type > :: value ,
                  "functor's value_type and array's value_type must match" ) ;

  // and what the functor consumes must be what it produces

  static_assert ( std::is_same < functor_in_type , functor_out_type > :: value ,
                  "functor's input and output type must match" ) ;

  // run transform with 'output' serving as both source and target

  transform ( ev , output , output ) ;
}

/// Implementation of 'classic' remap, which directly takes an array of values and remaps
/// it, internally creating a b-spline of given order just for the purpose. This is used for
/// one-shot remaps where the spline isn't reused, and specific to b-splines, since
/// the functor used is a b-spline evaluator. The spline defaults to a cubic b-spline
/// with mirroring on the bounds.
///
/// So here we have the 'classic' remap, where the input array holds coordinates and
/// the functor used is actually an interpolator. Since this is merely a special case
/// of using transform(), we delegate to transform().

template < typename input_type ,
           typename warp_type ,
           typename output_type >
void remap ( const input_type & input ,
             const warp_type & warp ,
             output_type & output ,
             bcv_type < input_type::actual_dimension > bcv
              = bcv_type < input_type::actual_dimension > ( MIRROR ) ,
            int degree = 3 )
{
  // fix the type for coordinates
  
  typedef typename warp_type::value_type coordinate_type ;
  
  // fix the type for values/coefficients
  
  typedef typename input_type::value_type value_type ;
  
  static_assert ( std::is_same < typename input_type::value_type ,
                                 typename output_type::value_type > :: value ,
                  "input and output array's value_type must match" ) ;

  static_assert ( warp_type::actual_dimension == output_type::actual_dimension ,
                  "warp aray's and output array's dimension must match" ) ;
                  
  enum { dim_in = input_type::actual_dimension } ;

  static_assert ( dim_in == coordinate_type::static_size ,
                  "warp array must contain values with same dimension as input array" ) ;

  // check shape compatibility
  
  if ( output.shape() != warp.shape() )
  {
    throw shape_mismatch 
    ( "the shapes of the warp array and the output array must match" ) ;
  }

  // create the bspline object
  // TODO may want to specify tolerance here instead of using default
  
  bspline < value_type , dim_in > bsp ( input.shape() , degree , bcv ) ;
  
  // prefilter, taking data in 'input' as knot point data
  
  bsp.prefilter ( input ) ;

  // create an evaluator over the bspline

  typedef evaluator < coordinate_type , value_type > evaluator_type ;
  
  evaluator_type ev ( bsp ) ;
  
  // and call transform(), passing in the evaluator,
  // the coordinate array and the target array
  
  transform ( ev , warp , output ) ;
}

/// this overload of transform() is very similar to the previous one, but instead of
/// picking input from an array, it feeds the discrete coordinates of the successive
/// places data should be rendered to into the unary_functor_type object.
///
/// this transform overload takes one template argument:
///
/// - 'unary_functor_type', which is a class satisfying the interface laid down in
///   unary_functor.h. This is an object which can provide values given *discrete*
///   coordinates, like class evaluator, but generalized to allow for arbitrary ways
///   of achieving its goal. The unary functor's in_type determines the number of
///   dimensions of the indices - since they are indices into the target array, the
///   functor's input type has to have the same number of dimensions as the target.
///
/// it takes two parameters:
///
/// - a reference to a const unary_functor_type object providing the functionality needed
///   to generate values from discrete coordinates
///
/// - a reference to a MultiArrayView to use as a target. This is where the resulting
///   data are put.

template < class unary_functor_type >
void transform ( const unary_functor_type & ev ,
                 MultiArrayView < unary_functor_type::dim_in ,
                                  typename unary_functor_type::out_type > & output )
{
  // the target has as many dimensions as the functor's input
  enum { dim_target = unary_functor_type::dim_in } ;
  
  // type for the discrete nD corner of the range, and the generator type;
  // the generator starts at level dim_target - 1, the highest axis
  typedef TinyVector < int , dim_target > nd_ic_type ;
  typedef detail::index_generator < unary_functor_type , dim_target - 1 > gen_t ;

  // the range to process spans the whole output array: it runs from a
  // default-constructed nd_ic_type (NOTE(review): all zeros per vigra's
  // TinyVector default constructor - confirm) to the output's shape
  shape_range_type < dim_target > range ( nd_ic_type() , output.shape() ) ;  

  // set up the generator and let fill() do the work
  gen_t gen ( ev , range ) ;  
  detail::fill < gen_t , dim_target > ( gen , output ) ;
}

/// for backward compatibility, deprecated
/// up to vspline 0.2.1, the function above was named 'index_remap', but I decided to
/// rename it 'transform', which is more apt.

template < class unary_functor_type >
void index_remap( const unary_functor_type & ev ,
                  MultiArrayView < unary_functor_type::dim_in ,
                                   typename unary_functor_type::out_type > & output )
{
  // pure forwarder: both arguments are handed on, unchanged, to the
  // two-argument transform() overload
  transform ( ev , output ) ;
}

namespace detail // workhorse code for grid_eval
{
// in grid_weight, for every dimension we have a set of ORDER weights
// for every position in this dimension. in grid_ofs, we have the
// partial offset for this dimension for every position. these partial
// offsets are the product of the index for this dimension at the position
// and the stride for this dimension, so that the sum of the partial
// offsets for all dimensions yields the offset into the coefficient array
// to the window of coefficients where the weights are to be applied.

template < typename evaluator_type , int level , int _vsize = 0 >
struct _grid_eval
{
  typedef typename evaluator_type::ele_type weight_type ;
  typedef MultiArrayView < level + 1 , typename evaluator_type::value_type > target_type ;
  
  void operator() ( int initial_ofs ,
                    MultiArrayView < 2 , weight_type > & weight ,
                    weight_type** const & grid_weight ,
                    const int & ORDER ,
                    int ** const & grid_ofs ,
                    const evaluator_type & itp ,
                    target_type & result )
  {
    for ( int ofs = 0 ; ofs < result.shape ( level ) ; ofs++ )
    {
      for ( int e = 0 ; e < ORDER ; e++ )
        weight [ vigra::Shape2 ( e , level ) ] = grid_weight [ level ] [ ORDER * ofs + e ] ;
      int cum_ofs = initial_ofs + grid_ofs [ level ] [ ofs ] ;
      auto region = result.bindAt ( level , ofs ) ;
      _grid_eval < evaluator_type , level - 1 , evaluator_type::vsize >()
        ( cum_ofs , weight , grid_weight , ORDER , grid_ofs , itp , region ) ;
    }
  }
} ;

/// Here, with template argument _vsize unfixed, we have the vector code,
/// below is a specialization for _vsize == 1 which is unvectorized.

template < typename evaluator_type , int _vsize >
struct _grid_eval < evaluator_type , 0 , _vsize >
{
  typedef typename evaluator_type::ele_type weight_type ;
  typedef MultiArrayView < 1 , typename evaluator_type::value_type > target_type ;

  // on my system, using clang++, the vectorized code is slightly slower
  // than the unvectorized code. With g++, the vectorized code is faster
  // than either clang version, but the unvectorized code is much slower.

  // all vector types are taken from the evaluator. Note that mc_ele_v and
  // out_v are two names for the same type, evaluator_type::out_v.

  enum { vsize = evaluator_type::vsize } ;
  enum { channels = evaluator_type::channels } ;
  typedef typename evaluator_type::value_type value_type ;
  typedef typename evaluator_type::ele_type ele_type ;
  typedef typename evaluator_type::ic_v ic_v ;
  typedef typename evaluator_type::ele_v ele_v ;
  typedef typename evaluator_type::out_v mc_ele_v ;
  typedef typename evaluator_type::out_v out_v ;
  typedef typename ele_v::IndexType index_type ;

  /// scatter variant chosen by scatter() when out_v and ele_v are the
  /// same type: src is scattered to dp in one go, using 'indexes'.

  inline void _scatter ( const out_v & src ,
                        ele_type * dp ,
                        index_type indexes ,
                        std::true_type
                      )
  {
    src.scatter ( dp , indexes ) ;
  }

  /// scatter variant chosen by scatter() when out_v and ele_v differ:
  /// src is treated as indexable, and each of its 'channels' members
  /// is scattered separately, member e going to dp + e.

  inline void _scatter ( const out_v & src ,
                        ele_type * dp ,
                        index_type indexes ,
                        std::false_type
                      )
  {
    for ( int e = 0 ; e < channels ; e++ )
      src[e].scatter ( dp + e , indexes ) ;
  }

  /// dispatch to one of the two _scatter variants above, depending on
  /// whether ele_v and out_v are the same type

  inline void scatter ( const out_v & src ,
                        ele_type * dp ,
                        index_type indexes
                      )
  {
    _scatter ( src , dp , indexes ,
                typename std::is_same < ele_v , out_v > :: type ()
    ) ;
  }
    
  /// bottom of the recursion, axis 0. As many full vectors as fit into
  /// 'region' are processed with the vectorized eval, the remaining
  /// values are processed one by one with the single-value eval.
  /// initial_ofs is the offset accumulated over the higher axes; 'weight'
  /// already holds the weights for the higher axes in its columns > 0.

  void operator() ( int initial_ofs ,
                    MultiArrayView < 2 , weight_type > & weight ,
                    weight_type** const & grid_weight ,
                    const int & ORDER ,
                    int ** const & grid_ofs ,
                    const evaluator_type & itp ,
                    target_type & region )
  {
    auto iter = region.begin() ;    
    int ofs_start = 0 ;

    // number of vectorized results
    int aggregates = region.size() / vsize ;
    // vectorized weights
    MultiArray < 2 , ele_v > vweight ( weight.shape() ) ;
    // vectorized offset
    ic_v select ;
    // buffer for target data
    mc_ele_v vtarget ;

    // initialize the vectorized weights for dimensions > 0
    // (each singular weight is assigned to a whole vector; these
    // weights stay the same for the whole run along axis 0)
    for ( int d = 1 ; d < weight.shape(1) ; d++ )
    {
      for ( int o = 0 ; o < ORDER ; o++ )
        vweight [ vigra::Shape2 ( o , d ) ] = weight [ vigra::Shape2 ( o , d ) ] ;
    }

    // get a pointer to the target array's data (seen as elementary type)
    ele_type * p_target = (ele_type*) ( region.data() ) ;
    // and the stride, if any, also in terms of the elementary type, from
    // one cluster of target data to the next
    int stride = vsize * channels * region.stride(0) ;

    for ( int a = 0 ; a < aggregates ; a++ )
    {
      // gather the individual weights into the vectorized form
      // (aggregate a covers positions a * vsize onwards; weight o of
      // successive positions is ORDER elements apart in grid_weight[0])
      for ( int o = 0 ; o < ORDER ; o++ )
      {
        vweight[ vigra::Shape2 ( o , 0 ) ].gather
          ( grid_weight [ 0 ] + ORDER * a * vsize ,
            ORDER * ic_v::IndexesFromZero() + o ) ;
      }
      select.load ( grid_ofs [ 0 ] + a * vsize ) ; // get the offsets from grid_ofs
      select += initial_ofs ; // add cumulated offsets from higher dimensions
      
      // now we can call the vectorized eval routine
      itp.eval ( select , vweight , vtarget ) ;
      
      // finally we scatter the vectorized result to target memory
      // (successive values are channels * region.stride(0) elements apart)
      scatter ( vtarget , p_target ,
                ic_v::IndexesFromZero() * channels * region.stride(0) ) ;

      // and set p_target to the next cluster of target values
      p_target += stride ;
    }
    
    // adapt the iterator into target array
    iter += aggregates * vsize ;
    // and the initial offset
    ofs_start += aggregates * vsize ;

    // now we finish off the stragglers:
    // (same procedure as in the unvectorized specialization, but
    // starting after the values the vector code has already handled)
    for ( int ofs = ofs_start ; ofs < region.shape ( 0 ) ; ofs++ )
    {
      for ( int e = 0 ; e < ORDER ; e++ )
        weight [ vigra::Shape2 ( e , 0 )  ] = grid_weight [ 0 ] [ ORDER * ofs + e ] ;
      int cum_ofs = initial_ofs + grid_ofs [ 0 ] [ ofs ] ;
      itp.eval ( cum_ofs , weight , *iter ) ;
      ++iter ;
    }
  }
} ;

template < typename evaluator_type >
struct _grid_eval < evaluator_type , 0 , 1 >
{
  typedef typename evaluator_type::ele_type weight_type ;
  typedef MultiArrayView < 1 , typename evaluator_type::value_type > target_type ;

  
  void operator() ( int initial_ofs ,
                    MultiArrayView < 2 , weight_type > & weight ,
                    weight_type** const & grid_weight ,
                    const int & ORDER ,
                    int ** const & grid_ofs ,
                    const evaluator_type & itp ,
                    target_type & region )
  {
    auto iter = region.begin() ;    
    int ofs_start = 0 ;

    // if Vc wasn't used, we start with ofs = 0 and this loop
    // does all the processing:
    for ( int ofs = ofs_start ; ofs < region.shape ( 0 ) ; ofs++ )
    {
      for ( int e = 0 ; e < ORDER ; e++ )
        weight [ vigra::Shape2 ( e , 0 )  ] = grid_weight [ 0 ] [ ORDER * ofs + e ] ;
      int cum_ofs = initial_ofs + grid_ofs [ 0 ] [ ofs ] ;
      itp.eval ( cum_ofs , weight , *iter ) ;
      ++iter ;
    }
  }
} ;

/// Here is the single-threaded code for the grid_eval function.
/// The first argument is a shape range, defining the subsets of data
/// to process in a single thread. the remainder are forwards of the
/// arguments to grid_eval, partly as pointers. The call is effected
/// via 'multithread()' which sets up the partitioning and distribution
/// to threads from a thread pool.

template < typename evaluator_type , // b-spline evaluator type
           int dim_out >             // dimension of target
void st_grid_eval ( shape_range_type < dim_out > range ,
                    typename evaluator_type::rc_type ** const _grid_coordinate ,
                    const evaluator_type * itp ,
                    MultiArrayView < dim_out , typename evaluator_type::value_type >
                      * p_result )
{
  // range            - the part of the target this call is responsible for
  // _grid_coordinate - per-axis coordinate arrays for the 'whole' target
  // itp              - the evaluator to use
  // p_result         - the 'whole' target array

  typedef typename evaluator_type::ele_type weight_type ;
  typedef typename evaluator_type::rc_type rc_type ;
  
  const int ORDER = itp->get_order() ;
  
  // pick the subarray of the 'whole' target array pertaining to this thread's range
  auto result = p_result->subarray ( range[0] , range[1] ) ;
  
  // pick the subset of coordinates pertaining to this thread's range
  const rc_type * grid_coordinate [ dim_out ] ;
  for ( int d = 0 ; d < dim_out ; d++ )
    grid_coordinate[d] = _grid_coordinate[d] + range[0][d] ;

  // set up storage for precalculated weights and offsets. The pointers
  // start out as null pointers, so that the guard below can release
  // them safely even if only part of the buffers have been allocated.

  weight_type * grid_weight [ dim_out ] = { } ;
  int * grid_ofs [ dim_out ] = { } ;

  // the guard's destructor releases all per-axis buffers when this
  // function is left - by returning, or because an exception was thrown
  // by an allocation, by the evaluator or by the workhorse routine.
  // Previously the buffers were only deleted at the very end of the
  // function, so they leaked whenever an exception passed through.

  struct storage_guard
  {
    weight_type ** weights ;
    int ** offsets ;

    ~storage_guard()
    {
      for ( int d = 0 ; d < dim_out ; d++ )
      {
        delete[] weights[d] ;
        delete[] offsets[d] ;
      }
    }
  } ;

  storage_guard guard = { grid_weight , grid_ofs } ;
  
  // get some metrics
  TinyVector < int , dim_out > shape ( result.shape() ) ;
  TinyVector < int , dim_out > estride ( itp->get_estride() ) ;
  
  // allocate space for the per-axis weights and offsets
  for ( int d = 0 ; d < dim_out ; d++ )
  {
    grid_weight[d] = new weight_type [ ORDER * shape [ d ] ] ;
    grid_ofs[d] = new int [ shape [ d ] ] ;
  }
  
  int select ;
  rc_type tune ;
  
  // fill in the weights and offsets, using the interpolator's split() to split
  // the coordinates received in grid_coordinate, the interpolator's obtain_weights
  // method to produce the weight components, and the strides of the coefficient
  // array to convert the integral parts of the coordinates into offsets.

  for ( int d = 0 ; d < dim_out ; d++ )
  {
    for ( int c = 0 ; c < shape [ d ] ; c++ )
    {
      itp->split ( grid_coordinate [ d ] [ c ] , select , tune ) ; 
      itp->obtain_weights ( grid_weight [ d ] + ORDER * c , d , tune ) ;
      grid_ofs [ d ] [ c ] = select * estride [ d ] ;
    }
  }
  
  // allocate storage for a set of singular weights
  MultiArray < 2 , weight_type > weight ( vigra::Shape2 ( ORDER , dim_out ) ) ;
  
  // now call the recursive workhorse routine, starting at the highest
  // axis with an accumulated offset of zero
  detail::_grid_eval < evaluator_type , dim_out - 1 , evaluator_type::vsize >()
   ( 0 , weight , grid_weight , ORDER , grid_ofs , *itp , result ) ;

  // no explicit clean-up needed here: 'guard' frees the buffers
}

} ; // end of namespace detail

/// this is the multithreaded version of grid_eval, which sets up the
/// full range over 'result' and calls 'multithread' to do the rest
///
/// grid_eval evaluates a b-spline object
/// at points whose coordinates are distributed in a grid, so that for
/// every axis there is a set of as many coordinates as this axis is long,
/// which will be used in the grid as the coordinate for this axis at the
/// corresponding position. The resulting coordinate matrix (which remains
/// implicit) is like a mesh grid made from the per-axis coordinates.
///
/// If we have two dimensions and x coordinates x0, x1 and x2, and y
/// coordinates y0 and y1, the resulting implicit coordinate matrix is
///
/// (x0,y0) (x1,y0) (x2,y0)
///
/// (x0,y1) (x1,y1) (x2,y1)
///
/// since the offsets and weights needed to perform an interpolation
/// only depend on the coordinates, this highly redundant coordinate array
/// can be processed more efficiently by precalculating the offset component
/// and weight component for all axes and then simply permutating them to
/// obtain the result. Especially for higher-degree and higher-dimensional
/// splines this saves quite some time, since the generation of weights
/// is computationally expensive.
///
/// grid_eval is useful for generating a scaled representation of the original
/// data, but when scaling down, aliasing will occur and the data should be
/// low-pass-filtered adequately before processing. Let me hint here that
/// low-pass filtering can be achieved by using b-spline reconstruction on
/// raw data (a 'smoothing spline') - or by prefiltering with exponential
/// smoothing, which can be activated by passing the 'smoothing' parameter
/// to the prefiltering routine. Of course any other way of smoothing can
/// be used just the same, like a Burt filter or Gaussian smoothing.
///
/// Note that this code is specific to b-spline evaluators and relies
/// on evaluator_type offering several b-spline specific methods which
/// are not present in other interpolators, like split() and
/// obtain_weights(). Since the weight generation for b-splines can
/// be done separately for each axis and is a computationally intensive
/// task, precalculating these per-axis weights makes sense. Coding for
/// the general case (other interpolators), the only achievement would be
/// the permutation of the partial coordinates, so little would be gained,
/// and instead a transform where the indices are used to pick up
/// the coordinates can be written easily: have a unary_functor taking
/// discrete coordinates, 'loaded' with the per-axis coordinates, and an
/// eval routine yielding the picked coordinates.

template < typename evaluator_type , // b-spline evaluator
           int dim_out >             // dimension of target
void grid_eval ( typename evaluator_type::rc_type ** const grid_coordinate ,
                 const evaluator_type & itp ,
                 MultiArrayView < dim_out , typename evaluator_type::value_type >
                   & result )
{
  // the range to process covers all of 'result': it runs from a
  // default-constructed shape_type (NOTE(review): presumably all zeros -
  // confirm) to result.shape()
  shape_range_type < dim_out > range ( shape_type < dim_out > () , result.shape() ) ;
  // hand the single-threaded routine and the partitioning function to
  // multithread(). The trailing arguments correspond to st_grid_eval's
  // parameters, which is why itp and result are passed as pointers.
  // NOTE(review): ncores * 8 is presumably the desired number of
  // partitions - confirm against multithread()
  multithread ( detail::st_grid_eval < evaluator_type , dim_out > ,
                vspline::partition_to_tiles < dim_out > ,
                ncores * 8 ,
                range ,
                grid_coordinate ,
                &itp ,
                &result ) ;
}

/// grid_eval allows us to code a function to restore the original knot point
/// data from a bspline. We simply fill in the discrete coordinates into the
/// grid coordinate vectors and call grid_eval with them.
/// note that this routine can't operate in-place, so you can't overwrite
/// a bspline object's core with the restored knot point data, you have to
/// provide a separate target array.
/// This routine is potentially faster than running a transform with
/// the same target, due to the precalculated weight components. For 1D data,
/// a transform is used, because here we'd just precalculate a weight for
/// each individual value, which would actually be slower.

template < int dimension ,
           typename value_type ,
           typename rc_type = float >
void restore
  ( const vspline::bspline < value_type , dimension > & bspl ,
    vigra::MultiArrayView < dimension , value_type > & target )
{
  // bspl   - the spline to restore the knot point data from
  // target - receives the restored data; must have the shape of bspl.core,
  //          otherwise shape_mismatch is thrown

  if ( target.shape() != bspl.core.shape() )
    throw shape_mismatch
     ( "restore: spline's core shape and target array shape must match" ) ;
    
  // set up an evaluator for the spline taking rc_type coordinates

  typedef vigra::TinyVector < rc_type , dimension > coordinate_type ;
  typedef vspline::evaluator < coordinate_type , value_type > ev_type ;
  ev_type ev ( bspl ) ;
  
  // TODO: might catch cases with spline degree < 2 where data can be
  // simply copied - or not even that, if source == target
  // for now we unconditionally give the caller a 'proper' restore.

  if ( dimension == 1 )
  {
    // for 1D splines, it's futile to do a grid_eval
    vspline::transform ( ev , target ) ;
  }
  else
  {
    // the coordinate component vectors, one per axis. The pointers start
    // out as null pointers, so that the guard below can release them
    // safely even if only part of the vectors have been allocated.
    rc_type * p_ruler [ dimension ] = { } ;

    // the guard's destructor releases the vectors when this scope is
    // left - normally, or because an allocation or grid_eval threw an
    // exception. Previously the vectors leaked in the latter case.
    struct ruler_guard
    {
      rc_type ** rulers ;

      ~ruler_guard()
      {
        for ( int d = 0 ; d < dimension ; d++ )
          delete[] rulers[d] ;
      }
    } ;

    ruler_guard guard = { p_ruler } ;

    // for every axis, fill in the discrete coordinates 0, 1, 2 ...
    for ( int d = 0 ; d < dimension ; d++ )
    {
      p_ruler[d] = new rc_type [ target.shape ( d ) ] ;
      for ( int i = 0 ; i < target.shape ( d ) ; i++ )
        p_ruler[d][i] = rc_type(i) ;
    }
    
    // evaluate the spline at the grid made up from these coordinates
    vspline::grid_eval < ev_type , dimension >
      ( p_ruler , ev , target ) ;

    // no explicit clean-up needed here: 'guard' frees the vectors
  }
}

} ; // end of namespace vspline

#endif // VSPLINE_TRANSFORM_H
