!--------*---------*---------*---------*---------*---------*---------*-*
!     
! File: ev_test.f
!
! Purpose
! =======
!
! < purpose of this module ... >
!
!
! Copyright(C) 2012-2020 RIKEN.
! Copyright(C) 2011-2012 Toshiyuki Imamura
!                        Graduate School of Informatics and Engineering,
!                        The University of Electro-Communications.
! Copyright (C) 2011- 2015 Japan Atomic Energy Agency.
! 
! Redistribution  and  use  in  source and binary forms, with or without
! modification,  are  permitted  provided  that the following conditions
! are met:
! 
! * Redistributions  of  source  code  must  retain  the above copyright
!   notice,  this  list  of  conditions  and  the  following  disclaimer.
! * Redistributions  in  binary  form must reproduce the above copyright
!   notice,  this list of conditions and the following disclaimer in the
!   documentation  and/or other materials provided with the distribution.
! 
! THIS  SOFTWARE  IS  PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
! ``AS IS''  AND  ANY  EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
! LIMITED  TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
! A  PARTICULAR  PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
! HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
! SPECIAL,  EXEMPLARY,  OR  CONSEQUENTIAL  DAMAGES  (INCLUDING,  BUT NOT
! LIMITED  TO,  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
! DATA,  OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
! THEORY  OF  LIABILITY,  WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
! (INCLUDING  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
! OF  THIS  SOFTWARE,  EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
!
!--------*---------*---------*---------*---------*---------*---------*-*

      subroutine ev_test(n, nvec, a, nma, w, z, nmz)

      use eigen_libs_mod
      use comm_mod
      use mpi
!$    use omp_lib

      implicit none

      integer, intent(in)    :: n, nvec, nma, nmz
      real(8), intent(in)    :: a(1:nma,*)
      real(8), intent(in)    :: w(1:n)
      real(8), intent(in)    :: z(1:nmz,*)

      integer, parameter     :: BLK = 3

      real(8), allocatable   :: b(:)
      real(8), allocatable   :: c(:,:), d(:,:), e(:,:)

      real(8)                :: m_epsilon

      integer                :: iloop_sta, iloop_end
      integer                :: jloop_sta, jloop_end
      integer                :: i, i_1, i_4
      integer                :: j, j_1, j_4
      integer                :: i0, k, ii_1, jj_1
      integer                :: ierr, k_4, i_m, i_mx, larray
      real(8)                :: a0, a1, a2, a3
      real(8)                :: z0, z1, z2, z3
      real(8)                :: s0, s1, s2, s3
      real(8)                :: t0, t1, t2, t3
      real(8)                :: ss0, ss1, ss2, ss3
      real(8)                :: tt0, tt1, tt2, tt3
      real(8)                :: s_0, s_1, s_2, s_3
      real(8)                :: t_0, t_1, t_2, t_3
      real(8)                :: b0, bb0, b_0, ra, rr, rz, r
      real(8)                :: f, ff(1:BLK)

      integer                :: TRD_COMM_WORLD, x_comm, y_comm
      integer                :: nnod, x_nnod, y_nnod
      integer                :: inod, x_inod, y_inod
      integer                :: x_owner_nod, y_owner_nod

      integer                :: local_size
      real(8), pointer       :: b0_(:)
      real(8), pointer       :: rz_(:)
      real(8), pointer       :: u_t(:), u_s(:)


      local_size = 1
!$OMP PARALLEL
!$OMP MASTER
!$    local_size = omp_get_num_threads()
!$OMP END MASTER
!$OMP END PARALLEL
!     
      allocate(b0_(0:local_size))
      allocate(rz_(0:local_size))

      m_epsilon = get_constant_eps()

      rz=0D0; rz_(:)=0D0
      rr=0D0
      r =0D0
      b0_(:)=0D0

      call eigen_get_comm (TRD_COMM_WORLD, x_comm, y_comm)
      call eigen_get_procs(nnod, x_nnod, y_nnod)
      call eigen_get_id   (inod, x_inod, y_inod)

      jloop_sta = eigen_loop_start(1, 'X')
      jloop_end = eigen_loop_end  (n, 'X')
      iloop_sta = eigen_loop_start(1, 'Y')
      iloop_end = eigen_loop_end  (n, 'Y')

      j_4 = eigen_translate_g2l   (n, 'X')
      i_4 = eigen_translate_g2l   (n, 'Y')

      k_4 = max(i_4, j_4)+8

      larray = n
      allocate(b(1:larray))
      b(1:n)=0d0

      larray = j_4
      allocate(c(1:larray,1:BLK))
      c(:,:)=0d0

      larray = max(n+1, 3*k_4, x_nnod, y_nnod)
      allocate(d(1:larray,1:BLK), e(1:larray,1:BLK))
      d(:,:)=0d0
      e(:,:)=0d0

      larray = n+max(x_nnod, y_nnod)
      allocate(u_t(1:larray), u_s(1:larray))
      u_t(:)=0
      u_s(:)=0

      ra =0.0D+00
      if (iloop_sta <= iloop_end) then
!$OMP PARALLEL DO
!$OMP+         PRIVATE(i_1,j_1,t0,a0)
        do i_1 = iloop_sta, iloop_end
          t0 = 0.0D+00
          do j_1 = jloop_sta, jloop_end
            a0 = a(j_1,i_1)
            t0 = t0 + abs(a0)   ! u = |A|_1
          end do
          d(i_1,1) = t0
        end do
!$OMP END PARALLEL DO

#ifdef ALLREDUCE_MPI
        call MPI_Allreduce(d(1,1), e(1,1), (iloop_end-iloop_sta+1),
     &       MPI_DOUBLE_PRECISION,
     &       MPI_SUM, x_comm, ierr)
#else
      call ALLREDUCE_binary_sum(x_comm,(iloop_end-iloop_sta+1),d,e)
#endif

!$OMP PARALLEL DO
!$OMP+         REDUCTION(MAX:ra)
        do i_1 = iloop_sta, iloop_end
          ra = max(ra, e(i_1,1))
        end do
!$OMP END PARALLEL DO
      end if

      iloop_sta = eigen_loop_start   ( 1,    'Y' )
      iloop_end = eigen_loop_end     ( nvec, 'Y' )

      if (iloop_sta <= iloop_end) then
        i_m = mod(iloop_end-iloop_sta+1,4)+iloop_sta
      else
        i_m = iloop_end+1
      end if
        i_m = iloop_end+1

!      i_mx = MOD(n,BLK)+1
!=      i_mx = n+1
!=      i_mx = nvec+1
      i_mx = MOD(nvec,BLK)+1

      do i=1,i_mx-1,1

        x_owner_nod = eigen_owner_node(i, 'X')
        y_owner_nod = eigen_owner_node(i, 'Y')
        jj_1        = eigen_translate_g2l(i, 'X')
        ii_1        = eigen_translate_g2l(i, 'Y')

        if (y_owner_nod == y_inod) then
          do j_1 = jloop_sta, jloop_end
            c(j_1,1) = z(j_1, ii_1)
          end do
          ff(1) = 1.0D+00
        else
          ff(1) = 0.0D+00
        end if
        call MPI_Bcast(c(1,1), j_4, MPI_DOUBLE_PRECISION,
     &       y_owner_nod-1, y_comm, ierr)

        if (iloop_sta <= iloop_end) then
          do i_1 = iloop_sta, i_m-1
            t0 = 0.0D+00
            s0 = 0.0D+00
            do j_1 = jloop_sta, jloop_end
              a0 = a(j_1,i_1)
              b0 = c(j_1,1)
              t0 = t0 + b0 * a0 ! t = Az(i)
              s0 = s0 + b0 * z(j_1, i_1) ! s = Zz(i)
            end do              ! k
            d(1+(i_1-1)*2,1) = t0
            d(2+(i_1-1)*2,1) = s0
          end do

!$OMP PARALLEL DO
!$OMP+         PRIVATE(i_1,j_1,a0,a1,a2,a3,b0,
!$OMP+                 t0,s0,t1,s1,t2,s2,t3,s3)
          do i_1 = i_m, iloop_end, 4
            t0 = 0.0D+00
            t1 = 0.0D+00
            t2 = 0.0D+00
            t3 = 0.0D+00
            s0 = 0.0D+00
            s1 = 0.0D+00
            s2 = 0.0D+00
            s3 = 0.0D+00
!DIR$ SIMD
!$OMP SIMD
            do j_1 = jloop_sta, jloop_end
              a0 = a(j_1,i_1+0)
              a1 = a(j_1,i_1+1)
              a2 = a(j_1,i_1+2)
              a3 = a(j_1,i_1+3)
              b0 = c(j_1,1)
              t0 = t0 + b0 * a0 ! t = Az(i)
              t1 = t1 + b0 * a1 ! t = Az(i)
              t2 = t2 + b0 * a2 ! t = Az(i)
              t3 = t3 + b0 * a3 ! t = Az(i)
              s0 = s0 + b0 * z(j_1, i_1+0) ! s = Zz(i)
              s1 = s1 + b0 * z(j_1, i_1+1) ! s = Zz(i)
              s2 = s2 + b0 * z(j_1, i_1+2) ! s = Zz(i)
              s3 = s3 + b0 * z(j_1, i_1+3) ! s = Zz(i)
            end do              ! k
            d(1+(i_1-1)*2,1) = t0
            d(2+(i_1-1)*2,1) = s0
            d(1+(i_1  )*2,1) = t1
            d(2+(i_1  )*2,1) = s1
            d(1+(i_1+1)*2,1) = t2
            d(2+(i_1+1)*2,1) = s2
            d(1+(i_1+2)*2,1) = t3
            d(2+(i_1+2)*2,1) = s3
          end do
!$OMP END PARALLEL DO
        else

          d(1:2*j_4,1) = 0D0
          e(1:2*j_4,1) = 0D0

        end if

        if (iloop_sta <= iloop_end) then

#ifdef ALLREDUCE_MPI
          call MPI_Allreduce(d(1,1), e(1,1), 2*(iloop_end-iloop_sta+1),
     &         MPI_DOUBLE_PRECISION,
     &         MPI_SUM, x_comm, ierr)
#else
      call ALLREDUCE_binary_sum(x_comm,2*(iloop_end-iloop_sta+1),d,e)
#endif
        end if

        call datacast_dbl(d(1,1), c(1,1),
     &       u_t, u_s, n, 0)

        x_owner_nod = eigen_owner_node(i, 'X')
        ii_1        = eigen_translate_g2l(i, 'Y')
        if (x_owner_nod == x_inod) then
          b0_ =0.0D+00
          j = 0

          if (iloop_sta <= iloop_end) then
            f = ff(1)
!$OMP PARALLEL DO
!$OMP+         PRIVATE(i_1,j,s0,t0)
            do i_1 = iloop_sta, iloop_end
!$            j = omp_get_thread_num()
              t0 = e(1+(i_1-1)*2,1)
              s0 = e(2+(i_1-1)*2,1)
              t0 = t0 - w(i) * d(i_1,1) ! t = Az(i) - wz(i)
              if ( i_1 == ii_1 ) s0 = s0 - f ! s = Zz(i) - 1(i)
              b0_(j) = b0_(j) + ABS(t0) ! b(i)=|Az(i)-wz(i)|_1
              rz_(j) = rz_(j) + s0**2 ! rz  =|ZZ-I|_F
            end do
!$OMP END PARALLEL DO
          end if
          b0 =0.0D+00
          do i_1=0,local_size-1
            b0 = b0 + b0_(i_1)
          end do
          b(i) = b0
        else
          b(i) =0.0D+00
        end if

      end do
!     >
!      do i=i_mx,n,BLK
      do i=i_mx,nvec,BLK

        do i0=0,BLK-1
          x_owner_nod = eigen_owner_node(i+i0, 'X')
          y_owner_nod = eigen_owner_node(i+i0, 'Y')
          jj_1        = eigen_translate_g2l(i+i0, 'X')
          ii_1        = eigen_translate_g2l(i+i0, 'Y')
          if (y_owner_nod == y_inod) then
            do j_1 = jloop_sta, jloop_end
              c(j_1,1+i0) = z(j_1, ii_1)
            end do
            ff(1+i0) = 1D0
          else
            ff(1+i0) =0.0D+00
          end if
          call MPI_Bcast(c(1,1+i0), j_4, MPI_DOUBLE_PRECISION,
     &         y_owner_nod-1, y_comm, ierr)
        end do

        if (iloop_sta <= iloop_end) then
          do i_1 = iloop_sta, i_m-1
            t0 = 0.0D+00
            s0 = 0.0D+00
            tt0 = 0.0D+00
            ss0 = 0.0D+00
            t_0 = 0.0D+00
            s_0 = 0.0D+00
!DIR$ SIMD
!$OMP SIMD
            do j_1 = jloop_sta, jloop_end
              a0 = a(j_1,i_1)
              z0 = z(j_1,i_1)
              b0  = c(j_1,1)
              bb0 = c(j_1,2)
              b_0 = c(j_1,3)
              t0  = t0  + b0  * a0 ! t = Az(i)
              s0  = s0  + b0  * z0 ! s = Zz(i)
              tt0 = tt0 + bb0 * a0 ! t = Az(i)
              ss0 = ss0 + bb0 * z0 ! s = Zz(i)
              t_0 = t_0 + b_0 * a0 ! t = Az(i)
              s_0 = s_0 + b_0 * z0 ! s = Zz(i)
            end do              ! k
            d(1+(i_1-1)*2,1) = t0
            d(2+(i_1-1)*2,1) = s0
            d(1+(i_1-1)*2,2) = tt0
            d(2+(i_1-1)*2,2) = ss0
            d(1+(i_1-1)*2,3) = t_0
            d(2+(i_1-1)*2,3) = s_0
          end do

!$OMP PARALLEL DO
!$OMP+         PRIVATE(i_1,j_1,jj_1,a0,a1,a2,a3,
!$OMP+                 z0,z1,z2,z3,b0,bb0,b_0,
!$OMP+                 t0,s0,t1,s1,t2,s2,t3,s3,
!$OMP+                 tt0,ss0,tt1,ss1,tt2,ss2,tt3,ss3,
!$OMP+                 t_0,s_0,t_1,s_1,t_2,s_2,t_3,s_3)
          do i_1 = i_m, iloop_end, 4
            t0 =0.0D+00
            s0 =0.0D+00
            t1 =0.0D+00
            s1 =0.0D+00
            t2 =0.0D+00
            s2 =0.0D+00
            t3 =0.0D+00
            s3 =0.0D+00
            tt0 =0.0D+00
            ss0 =0.0D+00
            tt1 =0.0D+00
            ss1 =0.0D+00
            tt2 =0.0D+00
            ss2 =0.0D+00
            tt3 =0.0D+00
            ss3 =0.0D+00
            t_0 =0.0D+00
            s_0 =0.0D+00
            t_1 =0.0D+00
            s_1 =0.0D+00
            t_2 =0.0D+00
            s_2 =0.0D+00
            t_3 =0.0D+00
            s_3 =0.0D+00
!DIR$ SIMD
!$OMP SIMD
            do j_1 = jloop_sta, jloop_end
              a0 = a(j_1,i_1+0)
              a1 = a(j_1,i_1+1)
              a2 = a(j_1,i_1+2)
              a3 = a(j_1,i_1+3)
              z0 = z(j_1,i_1+0)
              z1 = z(j_1,i_1+1)
              z2 = z(j_1,i_1+2)
              z3 = z(j_1,i_1+3)
              b0  = c(j_1,1)
              bb0 = c(j_1,2)
              b_0 = c(j_1,3)
              t0  = t0  + b0  * a0 ! t = Az(i)
              tt0 = tt0 + bb0 * a0 ! t = Az(i)
              t_0 = t_0 + b_0 * a0 ! t = Az(i)
              t1  = t1  + b0  * a1 ! t = Az(i)
              tt1 = tt1 + bb0 * a1 ! t = Az(i)
              t_1 = t_1 + b_0 * a1 ! t = Az(i)
              t2  = t2  + b0  * a2 ! t = Az(i)
              tt2 = tt2 + bb0 * a2 ! t = Az(i)
              t_2 = t_2 + b_0 * a2 ! t = Az(i)
              t3  = t3  + b0  * a3 ! t = Az(i)
              tt3 = tt3 + bb0 * a3 ! t = Az(i)
              t_3 = t_3 + b_0 * a3 ! t = Az(i)
              s0  = s0  + b0  * z0 ! s = Zz(i)
              ss0 = ss0 + bb0 * z0 ! s = Zz(i)
              s_0 = s_0 + b_0 * z0 ! s = Zz(i)
              s1  = s1  + b0  * z1 ! s = Zz(i)
              ss1 = ss1 + bb0 * z1 ! s = Zz(i)
              s_1 = s_1 + b_0 * z1 ! s = Zz(i)
              s2  = s2  + b0  * z2 ! s = Zz(i)
              ss2 = ss2 + bb0 * z2 ! s = Zz(i)
              s_2 = s_2 + b_0 * z2 ! s = Zz(i)
              s3  = s3  + b0  * z3 ! s = Zz(i)
              ss3 = ss3 + bb0 * z3 ! s = Zz(i)
              s_3 = s_3 + b_0 * z3 ! s = Zz(i)
            end do              ! k
            d(1+(i_1-1)*2,1) = t0
            d(2+(i_1-1)*2,1) = s0
            d(1+(i_1  )*2,1) = t1
            d(2+(i_1  )*2,1) = s1
            d(1+(i_1+1)*2,1) = t2
            d(2+(i_1+1)*2,1) = s2
            d(1+(i_1+2)*2,1) = t3
            d(2+(i_1+2)*2,1) = s3
            d(1+(i_1-1)*2,2) = tt0
            d(2+(i_1-1)*2,2) = ss0
            d(1+(i_1  )*2,2) = tt1
            d(2+(i_1  )*2,2) = ss1
            d(1+(i_1+1)*2,2) = tt2
            d(2+(i_1+1)*2,2) = ss2
            d(1+(i_1+2)*2,2) = tt3
            d(2+(i_1+2)*2,2) = ss3
            d(1+(i_1-1)*2,3) = t_0
            d(2+(i_1-1)*2,3) = s_0
            d(1+(i_1  )*2,3) = t_1
            d(2+(i_1  )*2,3) = s_1
            d(1+(i_1+1)*2,3) = t_2
            d(2+(i_1+1)*2,3) = s_2
            d(1+(i_1+2)*2,3) = t_3
            d(2+(i_1+2)*2,3) = s_3
          end do
!$OMP END PARALLEL DO
        else

          d(1:2*j_4,1) = 0D0
          e(1:2*j_4,1) = 0D0

        end if

        if (iloop_sta <= iloop_end) then

          do i0=0,BLK-1

#ifdef ALLREDUCE_MPI
            call MPI_Allreduce(d(1,1+i0), e(1,1+i0), 
     &           2*(iloop_end-iloop_sta+1),
     &           MPI_DOUBLE_PRECISION,
     &           MPI_SUM, x_comm, ierr)
#else
      call ALLREDUCE_binary_sum(x_comm,2*(iloop_end-iloop_sta+1),
     &           d(1,1+i0), e(1,1+i0) )
#endif
          end do

        end if

        do i0=0,BLK-1
          call datacast_dbl(d(1,1+i0), c(1,1+i0),
     &         u_t, u_s, n, 0)
        end do

        do i0=0,BLK-1
          x_owner_nod = eigen_owner_node(i+i0, 'X')
          ii_1        = eigen_translate_g2l(i+i0, 'Y')
          if (x_owner_nod == x_inod) then
            b0_ =0.0D+00
            j = 0
            if (iloop_sta <= iloop_end) then
              f = ff(1+i0)
!$OMP PARALLEL DO
!$OMP+         PRIVATE(i_1,j,s0,t0)
              do i_1 = iloop_sta, iloop_end
!$              j = omp_get_thread_num()
                t0 = e(1+(i_1-1)*2,1+i0)
                s0 = e(2+(i_1-1)*2,1+i0)
                t0 = t0 - w(i+i0) * d(i_1,1+i0) ! t = Az(i) - wz(i)
                if ( i_1 == ii_1 ) s0 = s0 - f ! s = Zz(i) - 1(i)
!     s0 = s0 - ff(1+i0)*((n-iabs(i_1-ii_1))/n)
                b0_(j) = b0_(j) + ABS(t0) ! b(i)=|Az(i)-wz(i)|_1
                rz_(j) = rz_(j) + s0**2 ! rz  =|ZZ-I|_F
              end do
!$OMP END PARALLEL DO
            end if
            b0 =0.0D+00
            do i_1=0,local_size-1
              b0 = b0 + b0_(i_1)
            end do
            b(i+i0) = b0
          else
            b(i+i0) =0.0D+00
          end if
        end do

      end do
!     >
      rz = 0.0D0
      do i_1=0,local_size-1
        rz = rz + rz_(i_1)
      end do
!     >
      d(1:n,1)=b(1:n)
      d(n+1,1)=rz

#ifdef ALLREDUCE_MPI
      call MPI_Allreduce(d(1,1), e(1,1), n+1, MPI_DOUBLE_PRECISION,
     &     MPI_SUM, TRD_COMM_WORLD, ierr)
#else
      call ALLREDUCE_binary_sum(TRD_COMM_WORLD,n+1,d,e)
#endif
      b(1:n)=e(1:n,1)
      rz    =e(n+1,1)

      r=ra
      call MPI_Allreduce(ra, r, 1, MPI_DOUBLE_PRECISION,
     &     MPI_MAX, TRD_COMM_WORLD, ierr)
      ra=r

      rr=0d0
      do i=1,n
        if (b(i) > rr) then
          rr=b(i); k=i
        end if
      end do                    ! i

      if (x_inod == 1 .and. y_inod == 1) then
!     print*, "max|Ax-wx|=", rr, k
        print*, "|A|_{1}=", ra
        print*, "epsilon=", m_epsilon
        rr =  rr/(n*m_epsilon*ra)
        print*, "max|Ax-wx|_{1}/Ne|A|_{1}=", rr, k
        if ( rr < sqrt(8d0) ) then
          print*,"*** Residual Error Test ***   : PASSED"
        else
          print*,"***=============================******"
          print*,"*** Residual Error Test ***   : FAILED"
          print*,"***=============================******"
        end if
        print*, "|ZZ-I|_{F}/sqrt(N)=", SQRT(rz/n)
        if ( SQRT(rz / n) < 128 * m_epsilon ) then
          print*,"*** Orthogonality  Test ***   : PASSED"
        else
          print*,"***=============================******"
          print*,"*** Orthogonality  Test ***   : FAILED"
          print*,"***=============================******"
        end if
      endif

      deallocate(b)
      deallocate(c)
      deallocate(d)
      deallocate(e)

      deallocate(u_t)
      deallocate(u_s)

      deallocate(b0_)
      deallocate(rz_)

      return

      end subroutine ev_test
