#if USE_ITT_BUILD
/*
 * kmp_itt.h -- ITT Notify interface.
 */


//===----------------------------------------------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.txt for details.
//
//===----------------------------------------------------------------------===//


#ifndef KMP_ITT_H
#define KMP_ITT_H

#include "kmp_lock.h"

#define INTEL_ITTNOTIFY_API_PRIVATE
#include "ittnotify.h"
#include "legacy/ittnotify.h"

#if KMP_DEBUG
    #define __kmp_inline           // Turn off inlining in debug mode.
#else
    #define __kmp_inline static inline
#endif

#if USE_ITT_NOTIFY
    extern kmp_int32  __kmp_itt_prepare_delay;
# ifdef __cplusplus
    extern "C" void __kmp_itt_fini_ittlib(void);
# else
    extern void __kmp_itt_fini_ittlib(void);
# endif
#endif

// Simplify the handling of an argument that is only required when USE_ITT_BUILD is enabled.
#define USE_ITT_BUILD_ARG(x) ,x
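// For example, a declaration written as
//     void __kmp_foo( int gtid  USE_ITT_BUILD_ARG( void * itt_sync_obj ) );
// expands here to
//     void __kmp_foo( int gtid, void * itt_sync_obj );
// and, when USE_ITT_BUILD is off, to "void __kmp_foo( int gtid );" (see the
// empty definition of this macro at the end of the file). __kmp_foo is an
// illustrative name only.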

void __kmp_itt_initialize();
void __kmp_itt_destroy();

// -------------------------------------------------------------------------------------------------
// New stuff for reporting high-level constructs.
// -------------------------------------------------------------------------------------------------

// Note the naming convention:
//     A __kmp_itt_xxxing() function should be called before the action, while
//     the matching __kmp_itt_xxxed() function should be called after the action.

// --- Parallel region reporting ---
__kmp_inline void __kmp_itt_region_forking(  int gtid, int team_size, int barriers, int serialized = 0 ); // Master only, before forking threads.
__kmp_inline void __kmp_itt_region_joined(   int gtid, int serialized = 0 ); // Master only, after joining threads. (*)
    // (*) Note: A thread may execute tasks after this point, though.
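
// A sketch of the intended call order on the master thread (argument values
// are illustrative only):
//
//     __kmp_itt_region_forking( gtid, team_size, barriers );
//     // ... fork the team and execute the parallel region ...
//     __kmp_itt_region_joined( gtid );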

// --- Frame reporting ---
// region = 0 - no regions, region = 1 - parallel, region = 2 - serialized parallel
__kmp_inline void __kmp_itt_frame_submit( int gtid, __itt_timestamp begin, __itt_timestamp end, int imbalance, ident_t *loc, int team_size, int region = 0 );
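
// A sketch of frame reporting, assuming __itt_get_timestamp() from the ITT
// API is available (values are illustrative only):
//
//     __itt_timestamp begin = __itt_get_timestamp();
//     // ... execute the frame (e.g., the parallel region) ...
//     __itt_timestamp end   = __itt_get_timestamp();
//     __kmp_itt_frame_submit( gtid, begin, end, 0, loc, team_size, 1 ); // region = 1: parallel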

// --- Metadata reporting ---
// begin/end - begin/end timestamps of a barrier frame, imbalance - aggregated wait time value, reduction - if this is a reduction barrier
__kmp_inline void __kmp_itt_metadata_imbalance( int gtid, kmp_uint64 begin, kmp_uint64 end, kmp_uint64 imbalance, kmp_uint64 reduction );
// sched_type: 0 - static, 1 - dynamic, 2 - guided, 3 - custom (all others); iterations - loop trip count, chunk - chunk size
__kmp_inline void __kmp_itt_metadata_loop( ident_t * loc, kmp_uint64 sched_type, kmp_uint64 iterations, kmp_uint64 chunk );
__kmp_inline void __kmp_itt_metadata_single( ident_t * loc );
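
// A sketch of loop metadata reporting (values are illustrative only):
//
//     __kmp_itt_metadata_loop( loc, 0, 1000, 16 ); // static schedule, 1000 iterations, chunk 16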

// --- Barrier reporting ---
__kmp_inline void * __kmp_itt_barrier_object( int gtid, int bt, int set_name = 0, int delta = 0 );
__kmp_inline void   __kmp_itt_barrier_starting( int gtid, void * object );
__kmp_inline void   __kmp_itt_barrier_middle(   int gtid, void * object );
__kmp_inline void   __kmp_itt_barrier_finished( int gtid, void * object );
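
// A sketch of the expected barrier reporting sequence; bt is a barrier type
// (e.g., bs_plain_barrier from kmp.h) and the calls are illustrative only:
//
//     void * obj = __kmp_itt_barrier_object( gtid, bt );
//     __kmp_itt_barrier_starting( gtid, obj ); // Before arriving at the barrier.
//     __kmp_itt_barrier_middle(   gtid, obj ); // Between the arrive and release phases.
//     __kmp_itt_barrier_finished( gtid, obj ); // After leaving the barrier.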

// --- Taskwait reporting ---
__kmp_inline void * __kmp_itt_taskwait_object( int gtid );
__kmp_inline void   __kmp_itt_taskwait_starting( int gtid, void * object );
__kmp_inline void   __kmp_itt_taskwait_finished(   int gtid, void * object );

// --- Task reporting ---
__kmp_inline void   __kmp_itt_task_starting( void * object );
__kmp_inline void   __kmp_itt_task_finished( void * object );

// --- Lock reporting ---
#if KMP_USE_DYNAMIC_LOCK
__kmp_inline void   __kmp_itt_lock_creating(  kmp_user_lock_p lock, const ident_t * );
#else
__kmp_inline void   __kmp_itt_lock_creating(  kmp_user_lock_p lock );
#endif
__kmp_inline void   __kmp_itt_lock_acquiring( kmp_user_lock_p lock );
__kmp_inline void   __kmp_itt_lock_acquired(  kmp_user_lock_p lock );
__kmp_inline void   __kmp_itt_lock_releasing( kmp_user_lock_p lock );
__kmp_inline void   __kmp_itt_lock_cancelled( kmp_user_lock_p lock );
__kmp_inline void   __kmp_itt_lock_destroyed( kmp_user_lock_p lock );

// --- Critical reporting ---
#if KMP_USE_DYNAMIC_LOCK
__kmp_inline void   __kmp_itt_critical_creating(  kmp_user_lock_p lock, const ident_t * );
#else
__kmp_inline void   __kmp_itt_critical_creating(  kmp_user_lock_p lock );
#endif
__kmp_inline void   __kmp_itt_critical_acquiring( kmp_user_lock_p lock );
__kmp_inline void   __kmp_itt_critical_acquired(  kmp_user_lock_p lock );
__kmp_inline void   __kmp_itt_critical_releasing( kmp_user_lock_p lock );
__kmp_inline void   __kmp_itt_critical_destroyed( kmp_user_lock_p lock );

// --- Single reporting ---
__kmp_inline void   __kmp_itt_single_start( int gtid );
__kmp_inline void   __kmp_itt_single_end(   int gtid );

// --- Ordered reporting ---
__kmp_inline void   __kmp_itt_ordered_init(  int gtid );
__kmp_inline void   __kmp_itt_ordered_prep(  int gtid );
__kmp_inline void   __kmp_itt_ordered_start( int gtid );
__kmp_inline void   __kmp_itt_ordered_end(   int gtid );

// --- Threads reporting ---
__kmp_inline void  __kmp_itt_thread_ignore();
__kmp_inline void  __kmp_itt_thread_name( int gtid );

// --- System objects ---
__kmp_inline void   __kmp_itt_system_object_created( void * object, char const * name );

// --- Stack stitching ---
__kmp_inline __itt_caller __kmp_itt_stack_caller_create(void);
__kmp_inline void __kmp_itt_stack_caller_destroy(__itt_caller);
__kmp_inline void __kmp_itt_stack_callee_enter(__itt_caller);
__kmp_inline void __kmp_itt_stack_callee_leave(__itt_caller);
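
// A sketch of the stack stitching sequence (illustrative only):
//
//     __itt_caller id = __kmp_itt_stack_caller_create();  // In the creating context.
//     // On a worker thread:
//     __kmp_itt_stack_callee_enter( id );   // Stitch this stack to the creator's.
//     // ... run the stitched code ...
//     __kmp_itt_stack_callee_leave( id );
//     __kmp_itt_stack_caller_destroy( id ); // Once all callees have left.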

// -------------------------------------------------------------------------------------------------
// Old stuff for reporting low-level internal synchronization.
// -------------------------------------------------------------------------------------------------

#if USE_ITT_NOTIFY

    /*
     * Support for SSC marks, which are used by SDE
     * http://software.intel.com/en-us/articles/intel-software-development-emulator
     * to mark points in instruction traces that represent spin-loops and are
     * therefore uninteresting when collecting traces for architecture simulation.
     */
    #ifndef INCLUDE_SSC_MARKS
    # define INCLUDE_SSC_MARKS (KMP_OS_LINUX && KMP_ARCH_X86_64)
    #endif

    /* Linux 64 only for now */
    #if (INCLUDE_SSC_MARKS && KMP_OS_LINUX && KMP_ARCH_X86_64)
    // Portable (at least for gcc and icc) code to insert the necessary instructions
    // to set %ebx and execute the unlikely no-op.
      #if defined( __INTEL_COMPILER )
      # define INSERT_SSC_MARK(tag) __SSC_MARK(tag)
      #else
      # define INSERT_SSC_MARK(tag)                                          \
      __asm__ __volatile__ ("movl %0, %%ebx; .byte 0x64, 0x67, 0x90 " ::"i"(tag):"%ebx")
      #endif
    #else
    # define INSERT_SSC_MARK(tag) ((void)0)
    #endif

    /* Markers for the start and end of regions that represent polling and
     * are therefore uninteresting to architectural simulations. 0x4376 and
     * 0x4377 are arbitrary numbers that should be unique in the space of
     * SSC tags, but there is no central issuing authority; rather,
     * randomness is expected to work.
     */
    #define SSC_MARK_SPIN_START() INSERT_SSC_MARK(0x4376)
    #define SSC_MARK_SPIN_END()   INSERT_SSC_MARK(0x4377)
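
    // Note: SSC_MARK_SPIN_START() and SSC_MARK_SPIN_END() are emitted by the
    // KMP_FSYNC_SPIN_INIT() and KMP_FSYNC_SPIN_ACQUIRED() macros defined below,
    // so spin loops instrumented with those macros are bracketed automatically.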

    // Markers for architecture simulation.
    // FORKING      : Before the master thread forks.
    // JOINING      : At the start of the join.
    // INVOKING     : Before the threads invoke microtasks.
    // DISPATCH_INIT: At the start of a dynamically scheduled loop.
    // DISPATCH_NEXT: After claiming the next iteration of a dynamically scheduled loop.
    #define SSC_MARK_FORKING()          INSERT_SSC_MARK(0xd693)
    #define SSC_MARK_JOINING()          INSERT_SSC_MARK(0xd694)
    #define SSC_MARK_INVOKING()         INSERT_SSC_MARK(0xd695)
    #define SSC_MARK_DISPATCH_INIT()    INSERT_SSC_MARK(0xd696)
    #define SSC_MARK_DISPATCH_NEXT()    INSERT_SSC_MARK(0xd697)

    // The object is an address used to associate a specific set of prepare, acquire,
    // release, and cancel operations.

    /* Sync prepare indicates a thread is going to start waiting for another thread
       to send a release event.  This operation should be done just before the thread
       begins checking for the existence of the release event. */

    /* Sync cancel indicates a thread is cancelling a wait on another thread and
       continuing execution without waiting for the other thread to release it. */

    /* Sync acquired indicates a thread has received a release event from another
       thread and has stopped waiting.  This operation must occur only after the release
       event is received. */

    /* Sync release indicates a thread is going to send a release event to another thread
       so it will stop waiting and continue execution. This operation must happen just
       before the release event. */

    #define KMP_FSYNC_PREPARE(   obj )  __itt_fsync_prepare(   (void *)( obj ) )
    #define KMP_FSYNC_CANCEL(    obj )  __itt_fsync_cancel(    (void *)( obj ) )
    #define KMP_FSYNC_ACQUIRED(  obj )  __itt_fsync_acquired(  (void *)( obj ) )
    #define KMP_FSYNC_RELEASING( obj )  __itt_fsync_releasing( (void *)( obj ) )
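
    /*
        A sketch of the intended pairing; the object obj and the wait/notify
        logic are illustrative only:

            // Waiting side:
            KMP_FSYNC_PREPARE( obj );          // Just before starting to check for the release event.
            while ( ! released( obj ) ) {}     // Wait for the release event.
            KMP_FSYNC_ACQUIRED( obj );         // Only after the release event is received.

            // Releasing side:
            KMP_FSYNC_RELEASING( obj );        // Just before sending the release event.
            release( obj );
    */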

    /*
        In case of waiting in a spin loop, ITT wants KMP_FSYNC_PREPARE() to be called with a delay
        (and not called at all if the waiting time is small). So, in spin loops, do not use
        KMP_FSYNC_PREPARE(); use KMP_FSYNC_SPIN_INIT() (before the spin loop),
        KMP_FSYNC_SPIN_PREPARE() (within the spin loop), and KMP_FSYNC_SPIN_ACQUIRED() (after it).
        See KMP_WAIT_YIELD() for an example.
    */
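
    /*
        A sketch of the intended usage, modeled on KMP_WAIT_YIELD(); flag, done(),
        and yield() are illustrative only. Note that KMP_FSYNC_SPIN_INIT() declares
        the sync_iters counter used by the other two macros, so all three must be
        expanded in the same scope:

            void * itt_sync_obj = NULL;
            KMP_FSYNC_SPIN_INIT( itt_sync_obj, (void *) flag );
            while ( ! done( flag ) ) {
                KMP_FSYNC_SPIN_PREPARE( itt_sync_obj );
                yield();
            }
            KMP_FSYNC_SPIN_ACQUIRED( itt_sync_obj );
    */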

    #undef  KMP_FSYNC_SPIN_INIT
    #define KMP_FSYNC_SPIN_INIT( obj, spin )    \
        int sync_iters = 0;                     \
        if ( __itt_fsync_prepare_ptr ) {        \
            if ( obj == NULL ) {                \
                obj = spin;                     \
            } /* if */                          \
        } /* if */                              \
        SSC_MARK_SPIN_START()

    #undef  KMP_FSYNC_SPIN_PREPARE
    #define KMP_FSYNC_SPIN_PREPARE( obj ) do {                          \
        if ( __itt_fsync_prepare_ptr && sync_iters < __kmp_itt_prepare_delay ) { \
            ++ sync_iters;                                              \
            if ( sync_iters >= __kmp_itt_prepare_delay ) {              \
                KMP_FSYNC_PREPARE( (void*) obj );                       \
            } /* if */                                                  \
        } /* if */                                                      \
     } while (0)
    #undef  KMP_FSYNC_SPIN_ACQUIRED
    #define KMP_FSYNC_SPIN_ACQUIRED( obj ) do {         \
        SSC_MARK_SPIN_END();                            \
        if ( sync_iters >= __kmp_itt_prepare_delay ) {  \
            KMP_FSYNC_ACQUIRED( (void*) obj );          \
        } /* if */                                      \
     } while (0)

    /* ITT will not report objects created within KMP_ITT_IGNORE(), e.g.:
           KMP_ITT_IGNORE(
               ptr = malloc( size );
           );
    */
    #define KMP_ITT_IGNORE( statement ) do {                            \
            __itt_state_t __itt_state_;                                 \
            if ( __itt_state_get_ptr ) {                                \
                __itt_state_ = __itt_state_get();                       \
                __itt_obj_mode_set( __itt_obj_prop_ignore, __itt_obj_state_set ); \
            }  /* if */                                                 \
            { statement }                                               \
            if ( __itt_state_get_ptr ) {                                \
                __itt_state_set( __itt_state_ );                        \
            }  /* if */                                                 \
    } while (0)

    const int KMP_MAX_FRAME_DOMAINS = 512; // Maximum number of frame domains to use (maps to
                                           // different OpenMP regions in the user source code).
    extern kmp_int32 __kmp_barrier_domain_count;
    extern kmp_int32 __kmp_region_domain_count;
    extern __itt_domain* __kmp_itt_barrier_domains[KMP_MAX_FRAME_DOMAINS];
    extern __itt_domain* __kmp_itt_region_domains[KMP_MAX_FRAME_DOMAINS];
    extern __itt_domain* __kmp_itt_imbalance_domains[KMP_MAX_FRAME_DOMAINS];
    extern kmp_int32 __kmp_itt_region_team_size[KMP_MAX_FRAME_DOMAINS];
    extern __itt_domain * metadata_domain;

#else

// Null definitions of the synchronization tracing functions.
# define KMP_FSYNC_PREPARE(   obj )        ((void)0)
# define KMP_FSYNC_CANCEL(    obj )        ((void)0)
# define KMP_FSYNC_ACQUIRED(  obj )        ((void)0)
# define KMP_FSYNC_RELEASING( obj )        ((void)0)

# define KMP_FSYNC_SPIN_INIT( obj, spin )  ((void)0)
# define KMP_FSYNC_SPIN_PREPARE(  obj )    ((void)0)
# define KMP_FSYNC_SPIN_ACQUIRED( obj )    ((void)0)

# define KMP_ITT_IGNORE( stmt ) do { stmt } while (0)

#endif // USE_ITT_NOTIFY

#if ! KMP_DEBUG
    // In release mode include definitions of inline functions.
    #include "kmp_itt.inl"
#endif

#endif // KMP_ITT_H

#else  /* USE_ITT_BUILD */

// Null definitions of the synchronization tracing functions.
// If USE_ITT_BUILD is not enabled, USE_ITT_NOTIFY cannot be either.
// By defining these we avoid unpleasant ifdef tests in many places.
# define KMP_FSYNC_PREPARE(   obj )        ((void)0)
# define KMP_FSYNC_CANCEL(    obj )        ((void)0)
# define KMP_FSYNC_ACQUIRED(  obj )        ((void)0)
# define KMP_FSYNC_RELEASING( obj )        ((void)0)

# define KMP_FSYNC_SPIN_INIT( obj, spin )  ((void)0)
# define KMP_FSYNC_SPIN_PREPARE(  obj )    ((void)0)
# define KMP_FSYNC_SPIN_ACQUIRED( obj )    ((void)0)

# define KMP_ITT_IGNORE( stmt ) do { stmt } while (0)

# define USE_ITT_BUILD_ARG(x)

#endif /* USE_ITT_BUILD */