#define SHIFT 9
#define MASK  255

      subroutine init_grids
      use grid_arrays
      implicit none
#ifdef MPI
      include "mpif.h"
#endif
      include 'DIMENSIONS'
      include 'COMMON.CHAIN'
      include 'COMMON.SPLITELE'
      include "COMMON.IOUNITS"
      include 'COMMON.GRID'
      double precision isqrt3
      integer d,total_cells,row_length
      integer res1,res2,res3,res4,res5

      isqrt3=1.0d0/dsqrt(3.0d0)

      negcellsize(1)=r_cut_int+r_buff_list
      negcellsize(2)=r_cut_int+r_buff_list
      negcellsize(3)=r_cut_int+r_buff_list

      neggridsize(1)=ceiling(boxxsize/negcellsize(1))
      neggridsize(2)=ceiling(boxysize/negcellsize(2))
      neggridsize(3)=ceiling(boxzsize/negcellsize(3))

      neggridmerge(1)=negcellsize(1)*neggridsize(1).ne.boxxsize
      neggridmerge(2)=negcellsize(2)*neggridsize(2).ne.boxysize
      neggridmerge(3)=negcellsize(3)*neggridsize(3).ne.boxzsize

      if(neggridmerge(1)) neggridsize(1)=neggridsize(1)-1
      if(neggridmerge(2)) neggridsize(2)=neggridsize(2)-1
      if(neggridmerge(3)) neggridsize(3)=neggridsize(3)-1

      negcellsize(1)=boxxsize/neggridsize(1)
      negcellsize(2)=boxysize/neggridsize(2)
      negcellsize(3)=boxzsize/neggridsize(3)

      poscellsize(1)=(r_cut_int+r_buff_list)*isqrt3
      poscellsize(2)=(r_cut_int+r_buff_list)*isqrt3
      poscellsize(3)=(r_cut_int+r_buff_list)*isqrt3

      posgridsize(1)=ceiling(boxxsize/poscellsize(1))
      posgridsize(2)=ceiling(boxysize/poscellsize(2))
      posgridsize(3)=ceiling(boxzsize/poscellsize(3))

      total_cells=neggridsize(1)*neggridsize(2)*neggridsize(3)
      row_length=total_cells+(16-mod(total_cells,16))+16
      max_grid_cells=total_cells

      allocate(dgridptr(-1:row_length-2,max_fg_threads),stat=res1)
      allocate(dgridrangep(-2:row_length-3),stat=res2)
      allocate(dgridrangepn(-2:row_length-3),stat=res3)
      allocate(dgridrangesc(-2:row_length-3),stat=res4)
      allocate(th_grid_count(-1:row_length-2,max_fg_threads),stat=res5)
      if((res1.ne.0).or.(res2.ne.0).or.(res3.ne.0).or.
     &   (res4.ne.0).or.(res5.ne.0)) then
        write (iout,*) "ERROR: cannot allocate enough memory"
#ifdef MPI
        call MPI_Abort(MPI_COMM_WORLD)
#endif
        stop
      endif

      end subroutine

C----------------------------------------------------------------------
      subroutine getcellid(pt,dx,dy,dz,cellid,
     &                     gridsize,invcellsize,gridmerge)
      implicit none
      include 'DIMENSIONS'
      include 'COMMON.GRID'
      double precision pt(3),invcellsize(3),x,y,z
      integer cellid,ix,iy,iz,dx,dy,dz,gridsize(3)
      logical gridmerge(3)
      x=pt(1)
      y=pt(2)
      z=pt(3)
      ix=idint(x*invcellsize(1))+dx
      iy=idint(y*invcellsize(2))+dy
      iz=idint(z*invcellsize(3))+dz
      if(ix.lt.0) ix=ix+gridsize(1)
      if(iy.lt.0) iy=iy+gridsize(2)
      if(iz.lt.0) iz=iz+gridsize(3)
      if(ix.ge.gridsize(1)) ix=ix-gridsize(1)
      if(iy.ge.gridsize(2)) iy=iy-gridsize(2)
      if(iz.ge.gridsize(3)) iz=iz-gridsize(3)
      !if(gridmerge(1).and.ix.ge.gridsize(1)-1) ix=gridsize(1)-1
      !if(gridmerge(2).and.iy.ge.gridsize(2)-1) iy=gridsize(2)-1
      !if(gridmerge(3).and.iz.ge.gridsize(3)-1) iz=gridsize(3)-1
      cellid=ix+gridsize(1)*(iy+iz*gridsize(2))
      end subroutine

C----------------------------------------------------------------------
      subroutine getcellmask(input,ics,fx,fy,fz,neighbours)
      implicit none
      double precision input(3),ics(3),fx,fy,fz,thr
      integer*4 neighbours
      neighbours=b'111111111111111111111111111'
      fx=input(1)*ics(1)
      fx=fx-dint(fx)
      fy=input(2)*ics(2)
      fy=fy-dint(fy)
      fz=input(3)*ics(3)
      fz=fz-dint(fz)

      thr=1.0d0-fx*fx-fy*fy     !    222222211111111110000000000
      if (0.0d0.ge.thr)         !    654321098765432109876543210
     &  neighbours=iand(neighbours,b'111111110111111110111111110')
      if (1.0d0-2.0d0*fx.ge.thr)
     &  neighbours=iand(neighbours,b'111111011111111011111111011')
      if (1.0d0-2.0d0*fy.ge.thr)
     &  neighbours=iand(neighbours,b'110111111110111111110111111')
      if (2.0d0-2.0d0*(fx+fy).ge.thr)
     &  neighbours=iand(neighbours,b'011111111011111111011111111')

      thr=1.0d0-fy*fy-fz*fz     !    222222211111111110000000000
      if (0.0d0.ge.thr)         !    654321098765432109876543210
     &  neighbours=iand(neighbours,b'111111111111111111111111000')
      if (1.0d0-2.0d0*fy.ge.thr)
     &  neighbours=iand(neighbours,b'111111111111111111000111111')
      if (1.0d0-2.0d0*fz.ge.thr)
     &  neighbours=iand(neighbours,b'111111000111111111111111111')
      if (2.0d0-2.0d0*(fy+fz).ge.thr)
     &  neighbours=iand(neighbours,b'000111111111111111111111111')

      thr=1.0d0-fx*fx-fz*fz     !    222222211111111110000000000
      if (0.0d0.ge.thr)         !    654321098765432109876543210
     &  neighbours=iand(neighbours,b'111111111111111111110110110')
      if (1.0d0-2.0d0*fx.ge.thr)
     &  neighbours=iand(neighbours,b'111111111111111111011011011')
      if (1.0d0-2.0d0*fz.ge.thr)
     &  neighbours=iand(neighbours,b'110110110111111111111111111')
      if (2.0d0-2.0d0*(fx+fz).ge.thr)
     &  neighbours=iand(neighbours,b'011011011111111111111111111')

      thr=1.0d0-fx*fx-fy*fy-fz*fz  ! 222222211111111110000000000
      if (0.0d0.ge.thr)            ! 654321098765432109876543210
     &  neighbours=iand(neighbours,b'111111111111111111111111110')
      if (1.0d0-2.0d0*fx.ge.thr)
     &  neighbours=iand(neighbours,b'111111111111111111111111011')
      if (1.0d0-2.0d0*fy.ge.thr)
     &  neighbours=iand(neighbours,b'111111111111111111110111111')
      if (2.0d0-2.0d0*(fx+fy).ge.thr)
     &  neighbours=iand(neighbours,b'111111111111111111011111111')
                                   ! 222222211111111110000000000
      if (1.0d0-2.0d0*fz.ge.thr)   ! 654321098765432109876543210
     &  neighbours=iand(neighbours,b'111111110111111111111111111')
      if (2.0d0-2.0d0*(fx+fz).ge.thr)
     &  neighbours=iand(neighbours,b'111111011111111111111111111')
      if (2.0d0-2.0d0*(fy+fz).ge.thr)
     &  neighbours=iand(neighbours,b'110111111111111111111111111')
      if (3.0d0-2.0d0*(fx+fy+fz).ge.thr)
     &  neighbours=iand(neighbours,b'011111111111111111111111111')

!     write(*,*)'F ',fx,fy,fz
!     write(*,'(a,b32)')'B ',neighbours
      end subroutine

C----------------------------------------------------------------------
!     subroutine calculate_nb_grid(input,ityp,n,
      subroutine calculate_nb_grid(input,abs_ityp,n,
     &                             gridcell,gridrange,griddata)
      use omp_lib
      use grid_arrays
      implicit none
      include "DIMENSIONS"
      include 'COMMON.GRID'

      integer i,j,n,ix,iy,iz,totalcells,cellid,dx,dy,dz,num_before
      integer itypj,s
      double precision input(3,n),ics(3)
      double precision x,y,z,fx,fy,fz,thr
!     integer ityp(maxres)
      integer abs_ityp(maxres)
      integer gridcell(-1:maxres)
      integer gridrange(-2:max_grid_cells)
      integer griddata(27*maxres)
      integer gsize(3)
      logical mrg(3)
      integer mark
      integer*4 neighbours

      integer my_thread,threads_used,range_lo,range_hi,from,to
      integer crange_lo,crange_hi
      integer th_from(max_fg_threads),th_to(max_fg_threads)
      integer th_cfrom(max_fg_threads),th_cto(max_fg_threads)

      gsize=neggridsize(1:3)
      totalcells=gsize(1)*gsize(2)*gsize(3)
      ics=1.0d0/negcellsize(1:3)
      mrg=neggridmerge(1:3)
      
      !range_lo=1
      !range_hi=n
      crange_lo=0
      crange_hi=totalcells

#ifdef _OPENMP
      threads_used=omp_get_max_threads()
#else
      threads_used=1
#endif
!$OMP PARALLEL NUM_THREADS(threads_used) DEFAULT(SHARED)
!$OMP& PRIVATE(my_thread,from,to,i,j,itypj,dx,dy,dz,cellid,
!$OMP&         fx,fy,fz,thr,mark,neighbours)
#ifdef _OPENMP
      my_thread=omp_get_thread_num()+1
#else
      my_thread=1
#endif

      !from=range_lo+((range_hi-range_lo)/threads_used)*(my_thread-1)
      !to=range_lo+((range_hi-range_lo)/threads_used)*my_thread-1
      !if(my_thread.eq.threads_used) to=range_hi
      call split_work_for_threads(from,to,1,n,my_thread,threads_used)
      th_from(my_thread)=from
      th_to(my_thread)=to
      th_grid_count(-1:totalcells,my_thread)=0

      do i=from,to
        !itypj=iabs(ityp(i))
        itypj=abs_ityp(i)
        if (itypj.eq.ntyp1 .or. itypj.eq.0) then
          gridcell(i)=-1
          cycle
        endif
        call getcellid(input(:,i),0,0,0,gridcell(i),gsize,ics,mrg)
        call getcellmask(input(:,i),ics,fx,fy,fz,neighbours)

        do dz=-1,1
          do dy=-1,1
            do dx=-1,1
              mark=iand(neighbours,1)
              neighbours=ishft(neighbours,-1)
              if (mark.eq.0) cycle
              call getcellid(input(:,i),dx,dy,dz,cellid,gsize,ics,mrg)
              !gridrange(2,cellid)=gridrange(2,cellid)+1
              th_grid_count(cellid,my_thread)=
     &                             th_grid_count(cellid,my_thread)+1
            enddo
          enddo
        enddo
      enddo
!$OMP END PARALLEL

!$OMP PARALLEL DEFAULT(SHARED) PRIVATE(my_thread,from,to,i,j,s)
      my_thread=omp_get_thread_num()+1
      !from=crange_lo+((crange_hi-crange_lo)/threads_used)*(my_thread-1)
      !to=crange_lo+((crange_hi-crange_lo)/threads_used)*my_thread-1
      !if(my_thread.eq.threads_used) to=crange_hi
      call split_work_for_threads(from,to,0,totalcells,
     &                            my_thread,threads_used)
      th_cfrom(my_thread)=from
      th_cto(my_thread)=to
      do i=from,to
        s=0
        do j=1,threads_used
          s=s+th_grid_count(i,j)
          th_grid_count(i,j)=s
        enddo
      enddo
!$OMP END PARALLEL

      call iprefix_sum(th_grid_count(0,threads_used),0,totalcells,
     &                 threads_used)
!     write(*,*)'cells=',sum(th_grid_count(1:totalcells,threads_used)),
!    &          '/',totalcells

      gridrange(-2)=0
      gridrange(-1)=0
      dgridptr(-1,:)=0
!$OMP PARALLEL DEFAULT(SHARED) PRIVATE(my_thread,from,to,i,j)
      my_thread=omp_get_thread_num()+1
      from=th_cfrom(my_thread)
      to=th_cto(my_thread)
      do i=from,to
        gridrange(i)=th_grid_count(i,threads_used)

        dgridptr(i,1)=th_grid_count(i-1,threads_used)+1
        do j=2,threads_used
          dgridptr(i,j)=dgridptr(i,1)+th_grid_count(i,j-1)
        enddo
      enddo
!$OMP END PARALLEL
      
!$OMP PARALLEL NUM_THREADS(threads_used) DEFAULT(SHARED)
!$OMP& PRIVATE(my_thread,from,to,i,j,itypj,dx,dy,dz,cellid,
!$OMP&         fx,fy,fz,neighbours,mark)
#ifdef _OPENMP
      my_thread=omp_get_thread_num()+1
#else
      my_thread=1
#endif
      from=th_from(my_thread)
      to=th_to(my_thread)
      do i=from,to
        !itypj=iabs(ityp(i))
        itypj=abs_ityp(i)
        if (itypj.eq.ntyp1 .or. itypj.eq.0) cycle
        call getcellmask(input(:,i),ics,fx,fy,fz,neighbours)
        do dz=-1,1
          do dy=-1,1
            do dx=-1,1
              mark=iand(neighbours,1)
              neighbours=ishft(neighbours,-1)
              if (mark.eq.0) cycle
              call getcellid(input(:,i),dx,dy,dz,cellid,gsize,ics,mrg)
              j=dgridptr(cellid,my_thread)
              griddata(j)=i
              dgridptr(cellid,my_thread)=j+1
            enddo
          enddo
        enddo
      enddo
!$OMP END PARALLEL

      end subroutine

C----------------------------------------------------------------------
!      subroutine calculate_positive_grid(input,ityp,n,grid)
!#ifdef _OPENMP
!      use omp_lib
!#endif
!      implicit none
!      include 'DIMENSIONS'
!      include 'COMMON.CHAIN'
!      include 'COMMON.GRID'
!      integer i,j,n,ix,iy,iz,cellid,itypj
!      integer range_lo,range_hi,from,to,my_thread,threads_used
!      double precision x,y,z,invcellsize(3)
!      integer grid(grid_length),ityp(maxres),gridsize(3)
!      double precision input(3,n)
!      integer lastcell,lastid,seqlen
!      integer numseq,sumseq,mxseq
!
!      gridsize=posgridsize
!
!      invcellsize=1.0d0/poscellsize
!
!      range_lo=1
!      range_hi=n
!      threads_used=omp_get_max_threads()
!!$OMP PARALLEL DEFAULT(SHARED)
!!$OMP& PRIVATE(i,j,x,y,z,ix,iy,iz,itypj,cellid,lastcell,lastid,seqlen,
!!$OMP&         my_thread)
!      my_thread=omp_get_thread_num()+1
!      from=range_lo+((range_hi-range_lo)/threads_used)*(my_thread-1)
!      to=range_lo+((range_hi-range_lo)/threads_used)*my_thread-1
!      if(my_thread.eq.threads_used) to=range_hi
!
!      lastcell=from
!      lastid=-1
!      seqlen=0
!
!      do i=from,to
!        x=input(1,i)
!        y=input(2,i)
!        z=input(3,i)
!        ix=idint(x*invcellsize(1))
!        iy=idint(y*invcellsize(2))
!        iz=idint(z*invcellsize(3))
!        !cellid=ix+(iy+iz*gridsize(2))*gridsize(1)
!        cellid=ix+ishft(iy,SHIFT)+ishft(iz,2*SHIFT)
!!       write(*,*)'GS',x,y,z
!!       write(*,*),'GI',x*invcellsize(1),y*invcellsize(2),
!!    &                  z*invcellsize(3)
!!       write(*,*),'GG',ix,iy,iz,cellid
!
!        itypj=iabs(ityp(i))
!        if (itypj.eq.ntyp1 .or. itypj.eq.0) cellid=-1
!
!        if(cellid.eq.lastid) then
!          seqlen=seqlen+1
!        else
!          do j=lastcell,i
!            grid(j+j+1)=seqlen-(j-lastcell)
!          enddo
!          lastcell=i
!!         sumseq=sumseq+seqlen
!!         mxseq=max(mxseq,seqlen)
!!         numseq=numseq+1
!          seqlen=1
!        endif
!        grid(i*2)=cellid
!        grid(i*2+1)=1
!
!        lastid=cellid
!      enddo
!!$OMP END PARALLEL
!
!!     if(seqlen.gt.0) then
!!       sumseq=sumseq+seqlen
!!       mxseq=max(mxseq,seqlen)
!!       numseq=numseq+1
!!     endif
!!     write(*,*)'SEQ',numseq,dble(sumseq)/numseq,mxseq
!!     do i=1,n
!!       write(*,*)'D',i,grid(i*2),grid(i*2+1)
!!     enddo
!      end subroutine
!
C----------------------------------------------------------------------
      subroutine calculate_grids
      use grid_arrays
      implicit none
      include 'DIMENSIONS'
      include 'COMMON.CHAIN'
      include 'COMMON.INTERACT'
      include 'COMMON.SPLITELE'
      include 'COMMON.GRID'
      include 'COMMON.OMP'

      call calculate_nb_grid(cp(1,1),abs_itel,nres,
     &                       gridcellsc,dgridrangesc,griddatasc)
      call calculate_nb_grid(c_tobox(1,1),abs_itype,nres,
     &                       gridcellp,dgridrangep,griddatap)
      call calculate_nb_grid(c_tobox(1,nres+1),abs_itype,nres,
     &                       gridcellpn,dgridrangepn,griddatapn)

!     call calculate_positive_grid(cp(1,1),itel,nres,
!    &                             positive_sc)
!     call calculate_positive_grid(c_tobox(1,1),itype,nres,
!    &                             positive_p)
!     call calculate_positive_grid(c_tobox(1,nres+1),itype,nres,
!    &                             positive_pn)
      end subroutine

C----------------------------------------------------------------------
      subroutine grid_stat_report
      implicit none
      include 'DIMENSIONS'
      include 'COMMON.GRID'
      integer i
      write (*,*)"GRID"
      write (*,*)"  count          ",gridallcount
      write (*,*)"  passes         ",gridpass
      write (*,*)"  misses         ",gridmiss
      write (*,*)"  typect         ",gridtypecuts
      write (*,*)"  positive hits  ",posgridhits
      write (*,*)"  positive jumps ",posgridjumps

      write (*,*)"  negative hits  ",neggridhits
      end subroutine
