// Per-row scratch for the H values of the DP wavefront. The kernel indexes it
// as [threadIdx.y][threadIdx.x + k]: columns [0, BLOCK_DIM_X) belong to the
// query chunk being processed and columns [BLOCK_DIM_X, 2*BLOCK_DIM_X) are
// pre-loaded with the values for the following chunk. reduction() also reuses
// it as the scratch buffer for the final max-reduction.
// NOTE(review): the fixed 8 x 64 shape presumably matches BLOCK_DIM_Y == 8 and
// BLOCK_DIM_X == 32 -- confirm against the launch configuration.
__shared__ int s_temp_H[8][64];
// Same layout as s_temp_H, holding the F values.
__shared__ int s_temp_F[8][64];

// Window of query residues. The kernel copies entries [256 + tid] down to
// [tid] and then refills [tid + BLOCK_DIM_X] from query_seq; the inner loops
// read it at r_query_offset + threadIdx.x + i.
__shared__ int s_query[2*256];

// Zeroed once per thread at kernel start; the only other use visible in this
// file is a commented-out increment in the inner loop.
__shared__ int flag[8][32];

// Max-reduction over the 32 threadIdx.x lanes that share one threadIdx.y row,
// using s_temp_H[threadIdx.y][0..47] as scratch (the row's previous contents
// in columns 0..31 are overwritten).
//
// r_Max (in/out): on entry, this lane's candidate value. On return, the lane
//                 with threadIdx.x == 0 holds the maximum over all 32 lanes;
//                 other lanes hold partial results and must not be used.
//
// Preconditions:
//   - The lanes of a row form exactly one warp (blockDim.x == 32) and all of
//     them call this function together, because __syncwarp() is used with the
//     default full mask.
//   - Requires CUDA 9 or later for __syncwarp().
//
// The original version relied on implicit warp lockstep: each round read a
// neighbour's slot and overwrote its own slot in one statement with no
// barrier. Lockstep is not guaranteed under independent thread scheduling, so
// every round is now split into read / barrier / write / barrier.
__device__
void reduction(int &r_Max){
  int *row = s_temp_H[threadIdx.y];
  const int lane = threadIdx.x;

  // Publish this lane's value and make it visible to the rest of the warp.
  row[lane] = r_Max;
  __syncwarp();

  // Offsets 16, 8, 4, 2: fold the upper partner into this lane's slot.
  // Lanes >= 16 read columns 32..47 in the first round; those slots do not
  // hold reduction inputs, but their results never reach lane 0.
  for(int offset = 16; offset >= 2; offset >>= 1){
    const int folded = max(row[lane], row[lane + offset]);
    __syncwarp();   // every lane has finished reading before anyone writes
    row[lane] = folded;
    __syncwarp();   // writes are visible before the next round reads them
  }

  // Final round (offset 1) goes straight to the output register.
  r_Max = max(row[lane], row[lane + 1]);
}

// Advances this lane's DP cell across one query chunk, 16 steps at a time.
//
// At step s the lane with threadIdx.x == x works on shared column x + s of
// row threadIdx.y. Per step it:
//   F = max(F_prev[col] - Gap_extend, H_prev[col] - Gap_init)
//   H = max(0, F, E, H_diag + score)
//   E = max(E - Gap_extend, H - Gap_init)
// where F_prev/H_prev are the values currently stored in s_temp_F/s_temp_H,
// H_diag is the H value this lane read from the previous column before
// overwriting it, and score = tex2D(texScoreMatrix, query_idx, subject_idx).
//
// r_query_offset : added to the column to index s_query.
// subject_idx    : second coordinate of the score-matrix lookup.
// r_Max (in/out) : running maximum of every H computed by this lane.
// r_E (in/out)   : E state carried in a register from step to step.
// r_H_Upper (in/out) : H_diag for the next step.
//
// The step count is BLOCK_DIM_X rounded up to a multiple of 16, exactly as in
// the original hand-unrolled code. BLOCK_DIM_Y is unused here.
//
// Synchronisation: lane x reads at step s+1 the column lane x+1 wrote at step
// s, and the caller fills/reads columns owned by other lanes right before and
// after this call. The original relied on implicit warp lockstep for all of
// that; __syncwarp() now orders the accesses explicitly.
// Preconditions: the 32 lanes of a row form one warp (blockDim.x == 32) and
// call this function together; CUDA 9 or later.
template <int BLOCK_DIM_X, int BLOCK_DIM_Y>
__device__
void kernelSmithWatermanMostInnerLoopUnrolled(int  r_query_offset, char subject_idx,
					      int &r_Max, int &r_E, int &r_H_Upper){

  char query_idx;
  int r_H, r_F;
  MATRIX_ELEMENT score;

  int *h_row = s_temp_H[threadIdx.y];
  int *f_row = s_temp_F[threadIdx.y];

  // Make the caller's writes to the upper half of the rows visible to the
  // lanes that will read them.
  __syncwarp();

  for(int i=0; i<BLOCK_DIM_X; i+= 16){
#pragma unroll
    for(int k=0; k<16; k++){
      const int col = threadIdx.x + i + k;

      // query residue and substitution score for this cell
      query_idx = s_query[r_query_offset + col];
      score = tex2D(texScoreMatrix, query_idx, subject_idx);

      // F from the values left in this column by the neighbouring lane
      r_F = f_row[col] - Gap_extend;
      r_F = max(r_F, h_row[col] - Gap_init);

      // H, clamped at zero; remember the old H as next step's diagonal
      r_H = max(r_F,  r_E);
      r_H = max(r_H, r_H_Upper + score);
      r_H_Upper = h_row[col];
      r_H = max(r_H, 0);

      r_Max = max(r_H, r_Max);

      // hand F and H to the lane that visits this column next
      f_row[col] = r_F;
      h_row[col] = r_H;

      // E for the next step
      r_E -= Gap_extend;
      r_E = max(r_E, r_H - Gap_init);

      // stores of this step must be visible before any lane starts the next
      __syncwarp();
    }
  }
}

// Rolled (one step per iteration) form of the per-chunk DP update: runs
// exactly BLOCK_DIM_X steps, with lane threadIdx.x working on shared column
// threadIdx.x + i of row threadIdx.y at step i.
//
// Per step:
//   F = max(F_prev[col] - Gap_extend, H_prev[col] - Gap_init)
//   H = max(0, F, E, H_diag + score)
//   E = max(E - Gap_extend, H - Gap_init)
// with score = tex2D(texScoreMatrix, query_idx, subject_idx).
//
// r_query_offset : added to the column to index s_query.
// subject_idx    : second coordinate of the score-matrix lookup.
// r_Max (in/out) : running maximum of every H computed by this lane.
// r_E (in/out)   : E state carried in a register from step to step.
// r_H_Upper (in/out) : H read from the previous column before it was
//                      overwritten; used as the diagonal term.
//
// BLOCK_DIM_Y is unused here.
//
// Synchronisation: a column written by one lane at step i is read by the
// neighbouring lane at step i + 1. The original relied on implicit warp
// lockstep; __syncwarp() now orders those accesses explicitly.
// Preconditions: the 32 lanes of a row form one warp (blockDim.x == 32) and
// call this function together; CUDA 9 or later.
template <int BLOCK_DIM_X, int BLOCK_DIM_Y>
__device__
void kernelSmithWatermanMostInnerLoop(int  r_query_offset, char subject_idx,
				      int &r_Max, int &r_E, int &r_H_Upper){

  char query_idx;
  int r_H, r_F;
  MATRIX_ELEMENT score;

  int *h_row = s_temp_H[threadIdx.y];
  int *f_row = s_temp_F[threadIdx.y];

  // Make shared-memory writes done by the caller visible across the warp.
  __syncwarp();

  for(int i=0; i<BLOCK_DIM_X; i++){
    const int col = threadIdx.x + i;

    // load query index from shared memory.
    query_idx = s_query[r_query_offset + col];

    // load score matrix's value from texture memory.
    score = tex2D(texScoreMatrix, query_idx, subject_idx);

    // calculate F
    r_F = f_row[col] - Gap_extend;
    r_F = max(r_F, h_row[col] - Gap_init);

    // calculate H (the old H of this column becomes the next diagonal)
    r_H = max(r_F,  r_E);
    r_H = max(r_H, r_H_Upper + score);
    r_H_Upper = h_row[col];
    r_H = max(r_H, 0);

    r_Max = max(r_H, r_Max);

    // store F and H for the lane that visits this column next
    f_row[col] = r_F;
    h_row[col] = r_H;

    // calculate "next" E
    r_E -= Gap_extend;
    r_E = max(r_E, r_H - Gap_init);

    // stores of this step must be visible before any lane starts the next
    __syncwarp();
  }
}

// Smith-Waterman scoring kernel: each threadIdx.y row of a block scores one
// sequence from seq_data against the query and writes the best local score to
// g_result.
//
// Work layout as visible in the code:
//   - qid = BLOCK_DIM_Y * blockIdx.x + threadIdx.y selects the sequence; it
//     indexes seq_index and g_result.
//   - The sequence is walked in tiles of BLOCK_DIM_X residues; within a tile,
//     threadIdx.x picks the residue at offset (31 - threadIdx.x).
//   - For every tile the query is walked in chunks of BLOCK_DIM_X columns.
//     The H/F values of a chunk's first BLOCK_DIM_X columns are spilled to
//     g_temp_HF as short2 and re-read when the next tile reaches that chunk.
//
// Parameters:
//   seq_data   residues of all sequences, addressed through seq_index.
//   seq_index  start offset per sequence; seq_index[qid + 1] - seq_index[qid]
//              is used as the sequence length.
//   query_seq  query residues.
//   query_len  query length; controls the chunk loop and the g_temp_HF stride.
//   g_temp_HF  global scratch for the spilled (H, F) pairs.
//   g_result   output, one int per qid.
//
// NOTE(review): the literals 31 and 256 and the shared array shapes suggest
// the kernel is only meant for BLOCK_DIM_X == 32, BLOCK_DIM_Y == 8 -- confirm.
// NOTE(review): query_seq is read at q_idx + tid + BLOCK_DIM_X, i.e. up to
// 287 entries past the current chunk start, and seq_data/seq_index are read
// without bounds checks; presumably the host pads these buffers -- verify.
// NOTE(review): H and F are ints in shared memory but are stored to
// g_temp_HF through make_short2, which narrows them to 16 bits.
template <int BLOCK_DIM_X, int BLOCK_DIM_Y>
__global__
void kernelSmithWaterman(char *seq_data,
			 int *seq_index,
			 char* query_seq,
			 int query_len,
			 TempScore *g_temp_HF,
			 int *g_result){
  
  int r_E, r_H_Upper;
  int  r_Max;

  short2 r_temp;
  
  // linear thread index inside the block
  const int tid = BLOCK_DIM_X * threadIdx.y + threadIdx.x;
  
  // index of the sequence handled by this row (the original comment called
  // it "the number of the query to process")
  const int qid = BLOCK_DIM_Y * blockIdx.x + threadIdx.y;
   
  // base of this thread's slots in g_temp_HF; chunk q_idx lives at
  // r_temp_offset + q_idx * BLOCK_DIM_Y
  int r_temp_offset = query_len * BLOCK_DIM_Y * blockIdx.x + tid;

  // index into the database sequence data: the tile is read in reversed
  // order, thread 0 takes offset 31 and thread 31 takes offset 0
  int r_subject_data_index = seq_index[qid] + 31 - threadIdx.x;
  
  // this thread's slot in the upper half of s_query
  int r_query_offset = BLOCK_DIM_X * BLOCK_DIM_Y + tid;
  
  // length of the sequence to process
  const int subject_length = seq_index[qid + 1] - seq_index[qid];
  
  char amino_matrix_subject_idx;

  flag[threadIdx.y][threadIdx.x] = 0;

  // Seed the upper half of s_query with the first BLOCK_DIM_X query residues
  // (only row 0 writes, so only BLOCK_DIM_X entries are initialised here).
  if(threadIdx.y == 0){
    s_query[r_query_offset] = query_seq[tid];
  }
  __syncthreads();

  r_Max = 0;

  // zero the spill slot one chunk past the end of the query; it is what the
  // last chunk of later tiles reads as its "next chunk" values
  g_temp_HF[r_temp_offset + (query_len) * BLOCK_DIM_Y] = make_short2(0, 0);
  
  // load a subject index from device memory.
  amino_matrix_subject_idx = seq_data[r_subject_data_index]; 

  r_E = r_H_Upper = 0;

  s_temp_H[threadIdx.y][threadIdx.x] = 0;
  s_temp_F[threadIdx.y][threadIdx.x] = 0;
  
  // ---- first tile of the sequence: no previous H/F, so the upper half of
  // ---- the rows is zero-filled instead of being read from g_temp_HF
  int q_idx;
  for(q_idx = 0; q_idx < query_len - BLOCK_DIM_X; q_idx += BLOCK_DIM_X){
    // load query indexes to shared memory from device memory.
    if(q_idx % 256 == 0){
      // copy the upper half of s_query down to the lower half, then refill
      // entries [BLOCK_DIM_X, BLOCK_DIM_X + 256) from query_seq
       __syncthreads();
      s_query[tid] = s_query[r_query_offset];
      __syncthreads();
      s_query[tid + BLOCK_DIM_X] = query_seq[q_idx + tid + BLOCK_DIM_X];
      __syncthreads();
    }

    s_temp_F[threadIdx.y][threadIdx.x + BLOCK_DIM_X] = 0;
    s_temp_H[threadIdx.y][threadIdx.x + BLOCK_DIM_X] = 0;
    
    kernelSmithWatermanMostInnerLoopUnrolled<BLOCK_DIM_X, BLOCK_DIM_Y>
      (q_idx % (BLOCK_DIM_X * BLOCK_DIM_Y), amino_matrix_subject_idx, r_Max, r_E, r_H_Upper);

    // spill the finished lower half for the next tile
    r_temp = make_short2(s_temp_H[threadIdx.y][threadIdx.x],
			 s_temp_F[threadIdx.y][threadIdx.x]);
    g_temp_HF[r_temp_offset + q_idx * BLOCK_DIM_Y] = r_temp;
      
    // slide the partially processed upper half down for the next chunk
    s_temp_H[threadIdx.y][threadIdx.x] = s_temp_H[threadIdx.y][threadIdx.x + BLOCK_DIM_X];
    s_temp_F[threadIdx.y][threadIdx.x] = s_temp_F[threadIdx.y][threadIdx.x + BLOCK_DIM_X];
  }

  // ---- last query chunk of the first tile (same steps, no slide-down)
  // load query indexes to shared memory from device memory.
  if(q_idx % 256 == 0){
    // copy the upper half of s_query down, then refill from query_seq
    __syncthreads();
    s_query[tid] = s_query[r_query_offset];
    __syncthreads();
    s_query[tid + BLOCK_DIM_X] = query_seq[q_idx + tid + BLOCK_DIM_X];
    __syncthreads();
  }
    
  s_temp_F[threadIdx.y][threadIdx.x + BLOCK_DIM_X] = 0;
  s_temp_H[threadIdx.y][threadIdx.x + BLOCK_DIM_X] = 0;
      
  kernelSmithWatermanMostInnerLoopUnrolled<BLOCK_DIM_X, BLOCK_DIM_Y>
    (q_idx % (BLOCK_DIM_X * BLOCK_DIM_Y), amino_matrix_subject_idx, r_Max, r_E, r_H_Upper);
  r_temp = make_short2(s_temp_H[threadIdx.y][threadIdx.x],
		       s_temp_F[threadIdx.y][threadIdx.x]);
  g_temp_HF[r_temp_offset + q_idx * BLOCK_DIM_Y] = r_temp;
  
  // ---- remaining tiles of the sequence
  // NOTE(review): the trip count depends on subject_length, which is derived
  // from qid and therefore differs between threadIdx.y rows, yet the body
  // contains __syncthreads(). Rows with fewer tiles leave the loop while
  // others still wait at the barrier -- confirm this is safe on the targeted
  // hardware.
  // NOTE(review): when q_idx is 0 here, s_query[256 + tid] still holds what
  // the previous pass over the query loaded last, so the copy-down does not
  // obviously restore query_seq[0 .. BLOCK_DIM_X) -- verify against the host
  // layout of query_seq.
  int subject_idx;
  for(subject_idx = BLOCK_DIM_X;
      subject_idx < subject_length;
      subject_idx += BLOCK_DIM_X) {
    // load a subject index from device memory.
    amino_matrix_subject_idx = seq_data[r_subject_data_index + subject_idx]; 
    /*
    if(blockIdx.x == 0 && threadIdx.y == 0){
      //g_result[r_subject_data_index + subject_idx] = amino_matrix_subject_idx;
      g_result[31 - threadIdx.x + subject_idx] = amino_matrix_subject_idx;
    }
    */
    //amino_matrix_subject_idx = 1;
    
    r_E = r_H_Upper = 0;
    
    s_temp_H[threadIdx.y][threadIdx.x] = 0;
    s_temp_F[threadIdx.y][threadIdx.x] = 0;
    
    int q_idx;
    for(q_idx = 0; q_idx < query_len - BLOCK_DIM_X; q_idx += BLOCK_DIM_X){
      // load query indexes to shared memory from device memory.
      if(q_idx % 256 == 0){
	// copy the upper half of s_query down, then refill from query_seq
	__syncthreads();
	s_query[tid] = s_query[r_query_offset];
	__syncthreads();
	s_query[tid + BLOCK_DIM_X] = query_seq[q_idx + tid + BLOCK_DIM_X];
	__syncthreads();
      }

      // fetch the (H, F) pair the previous tile spilled for the next chunk
      r_temp = g_temp_HF[r_temp_offset + (q_idx + BLOCK_DIM_X) * BLOCK_DIM_Y];
      
      s_temp_H[threadIdx.y][threadIdx.x + BLOCK_DIM_X]
	= r_temp.x;
      s_temp_F[threadIdx.y][threadIdx.x + BLOCK_DIM_X]
	= r_temp.y;
      
      kernelSmithWatermanMostInnerLoopUnrolled<BLOCK_DIM_X, BLOCK_DIM_Y>
	(q_idx % (BLOCK_DIM_X * BLOCK_DIM_Y), amino_matrix_subject_idx, r_Max, r_E, r_H_Upper);
      
      // spill the finished lower half for the next tile
      r_temp = make_short2(s_temp_H[threadIdx.y][threadIdx.x],
			   s_temp_F[threadIdx.y][threadIdx.x]);
      g_temp_HF[r_temp_offset + q_idx * BLOCK_DIM_Y] = r_temp;
      
      // slide the upper half down for the next chunk
      s_temp_H[threadIdx.y][threadIdx.x] = s_temp_H[threadIdx.y][threadIdx.x + BLOCK_DIM_X];
      s_temp_F[threadIdx.y][threadIdx.x] = s_temp_F[threadIdx.y][threadIdx.x + BLOCK_DIM_X];
    }

    // last query chunk of this tile
    // load query indexes to shared memory from device memory.
    if(q_idx % 256 == 0){
      // copy the upper half of s_query down, then refill from query_seq
      __syncthreads();
      s_query[tid] = s_query[r_query_offset];
      __syncthreads();
      s_query[tid + BLOCK_DIM_X] = query_seq[q_idx + tid + BLOCK_DIM_X];
      __syncthreads();
    }
    
    r_temp = g_temp_HF[r_temp_offset + (q_idx + BLOCK_DIM_X) * BLOCK_DIM_Y];
      
    s_temp_H[threadIdx.y][threadIdx.x + BLOCK_DIM_X]
      = r_temp.x;
    s_temp_F[threadIdx.y][threadIdx.x + BLOCK_DIM_X]
      = r_temp.y;
      
    kernelSmithWatermanMostInnerLoopUnrolled<BLOCK_DIM_X, BLOCK_DIM_Y>
      (q_idx % (BLOCK_DIM_X * BLOCK_DIM_Y), amino_matrix_subject_idx, r_Max, r_E, r_H_Upper);
    
    r_temp = make_short2(s_temp_H[threadIdx.y][threadIdx.x],
			 s_temp_F[threadIdx.y][threadIdx.x]);
    g_temp_HF[r_temp_offset + q_idx * BLOCK_DIM_Y] = r_temp;
  }
  
  // get the maximum score of 1 warp
  reduction(r_Max);
  
  // only the threadIdx.x == 0 thread of each row publishes the result
  if(threadIdx.x == 0){
    g_result[qid] = r_Max;
    //g_result[qid] = seq_data[r_subject_data_index];
    //g_result[qid] = r_Max;
    //g_result[qid] = subject_idx;
    //g_result[qid] = subject_length;
    //g_result[qid] = seq_index[qid];
  }
  
  //g_result[tid] = 0;
}
