/*
 * Copyright (C) 2021 Intel Corporation
 *
 * This software and the related documents are Intel copyrighted materials, and your use of them
 * is governed by the express license under which they were provided to you ("License"). Unless
 * the License provides otherwise, you may not use, modify, copy, publish, distribute, disclose
 * or transmit this software or the related documents without Intel's prior written permission.
 *
 * This software and the related documents are provided as is, with no express or implied
 * warranties, other than those that are expressly stated in the License.
*/

// Tuning constants for the GlobalAdd6 offset computation below.
// NOTE(review): the names suggest they mirror the target GPU topology and the
// host-side launch configuration -- confirm against the host code.

// Modulus applied to the work-group id before it is scaled by LOCAL_SIZE.
#define EU_PER_SUBSLICE 16
// Required sub-group size for the kernel; also the multiplier applied to the
// sub-group id.
#define SUBGROUP_SIZE 8
// Multiplier for the (wrapped) work-group id; presumably equal to the
// work-group size used at enqueue time -- TODO confirm.
#define LOCAL_SIZE 512
// Modulus applied to the combined group/sub-group index, i.e. the upper bound
// on the slice index.
#define THREADS_PER_SUBSLICE 128
// Distance, in floats, between the starts of consecutive slices. The kernel
// reads 256 floats from each slice start.
#define ELEMENTS_PER_THREAD 375

// GlobalAdd6: reads 256 consecutive floats from a per-sub-group slice of `pa`
// and folds them into a running sum.
//
//   pa   - global float buffer; read at pa[id .. id + 255]. No bounds check is
//          performed, so the buffer must be large enough for the largest offset.
//   size - not read by this kernel.
__attribute__((intel_reqd_sub_group_size(SUBGROUP_SIZE)))
__kernel void GlobalAdd6(__global float *const pa, int size)
{
    // Give each HW thread its own slice of memory: combine the wrapped
    // work-group id and the sub-group id, wrap the result to
    // THREADS_PER_SUBSLICE, then scale by the slice stride.
    // NOTE(review): with the current constants LOCAL_SIZE is a multiple of
    // THREADS_PER_SUBSLICE, so the work-group term vanishes under the modulo
    // and the offset depends only on the sub-group id -- confirm intended.
    const size_t group_term = (get_group_id(0) % EU_PER_SUBSLICE) * LOCAL_SIZE;
    const size_t sub_group_term = get_sub_group_id() * SUBGROUP_SIZE;
    const int id = ((group_term + sub_group_term) % THREADS_PER_SUBSLICE) * ELEMENTS_PER_THREAD;

    float acc = 1.f;
    int count = 0;

    while (count < 256)
    {
        // Each load is scaled by the slice offset before being accumulated.
        acc += pa[id + count] * id;
        ++count;
    }

    // This store is not expected to ever execute; it exists only so the
    // accumulated value is observable and the compiler cannot discard the
    // loads performed in the loop above.
    if (acc == 0.f)
    {
        pa[id] = acc;
    }
}
