diff --git a/tools/fregrid_gpu/conserve_interp_gpu.c b/tools/fregrid_gpu/conserve_interp_gpu.c index e8426067..26701ea2 100644 --- a/tools/fregrid_gpu/conserve_interp_gpu.c +++ b/tools/fregrid_gpu/conserve_interp_gpu.c @@ -18,6 +18,7 @@ * . **********************************************************************/ #include +#include #include #include #include @@ -51,7 +52,6 @@ void setup_conserve_interp_gpu(int ntiles_input_grid, Grid_config *input_grid, i int nlon_output_cells = output_grid[otile].nxc; int nlat_output_cells = output_grid[otile].nyc; - int ncells_output_grid = nlon_output_cells*nlat_output_cells; int ngridpts_output_grid = (nlon_output_cells+1)*(nlat_output_cells+1); Grid_cells_struct_config output_grid_cells; @@ -80,7 +80,9 @@ void setup_conserve_interp_gpu(int ntiles_input_grid, Grid_config *input_grid, i get_bounding_indices_gpu(nlon_output_cells, nlat_output_cells, nlon_input_cells, nlat_input_cells, output_grid[otile].latc, input_grid[itile].latc, &jlat_overlap_starts, &jlat_overlap_ends); - create_upbound_nxcells_arrays_on_device_gpu( ncells_input_grid, &approx_nxcells_per_ij1, &ij2_start, &ij2_end); + approx_nxcells_per_ij1 = (int *) acc_malloc(ncells_input_grid*sizeof(int)); + ij2_start = (int *) acc_malloc(ncells_input_grid*sizeof(int)); + ij2_end = (int *) acc_malloc(ncells_input_grid*sizeof(int)); upbound_nxcells = get_upbound_nxcells_2dx2d_gpu( nlon_input_cells, nlat_input_cells, nlon_output_cells, nlat_output_cells, @@ -126,13 +128,25 @@ void setup_conserve_interp_gpu(int ntiles_input_grid, Grid_config *input_grid, i else mpp_error("conserve_interp: interp_method should be CONSERVE_ORDER1 or CONSERVE_ORDER2"); } //conserve_order methods - free_upbound_nxcells_arrays_gpu(ncells_input_grid, &approx_nxcells_per_ij1, &ij2_start, &ij2_end); + acc_free(approx_nxcells_per_ij1); approx_nxcells_per_ij1 = NULL; + acc_free(ij2_start); ij2_start = NULL; + acc_free(ij2_end); ij2_end = NULL; + free_input_grid_mask_gpu(ncells_input_grid, &input_grid_mask); delete_grid_from_device_gpu(ngridpts_input_grid, input_grid[itile].lonc, input_grid[itile].latc); } //input tile - free_grid_cell_struct_gpu( ncells_output_grid, &output_grid_cells); + + acc_free(output_grid_cells.lon_min); output_grid_cells.lon_min = NULL; + acc_free(output_grid_cells.lon_max); output_grid_cells.lon_max = NULL; + acc_free(output_grid_cells.lon_cent); output_grid_cells.lon_cent = NULL; + acc_free(output_grid_cells.lat_min); output_grid_cells.lat_min = NULL; + acc_free(output_grid_cells.lat_max); output_grid_cells.lat_max = NULL; + acc_free(output_grid_cells.area); output_grid_cells.area = NULL; + acc_free(output_grid_cells.nvertices); output_grid_cells.nvertices=NULL; + acc_free(output_grid_cells.lon_vertices); output_grid_cells.lon_vertices = NULL; + acc_free(output_grid_cells.lat_vertices); output_grid_cells.lat_vertices = NULL; delete_grid_from_device_gpu(ngridpts_output_grid, output_grid[otile].lonc, output_grid[otile].latc); }//output tile @@ -479,16 +493,14 @@ void do_scalar_conserve_interp_gpu(Interp_config_gpu *interp_gpu, int varid, int int ncells_output_grid = output_grid[otile].nxc * output_grid[otile].nyc; double *p_fieldout_data = NULL; p_fieldout_data = field_out[otile].data; - int *out_miss = NULL ; out_miss = (int *)malloc(ncells_output_grid*sizeof(int)); - double *out_area = NULL ; out_area = (double *)malloc(ncells_output_grid*sizeof(double)); -#pragma acc enter data create(p_fieldout_data[:ncells_output_grid], \ - out_area[:ncells_output_grid], \ - out_miss[:ncells_output_grid]) + int *out_miss = NULL ; out_miss = (int *) acc_malloc(ncells_output_grid*sizeof(int)); + double *out_area = NULL ; out_area = (double *) acc_malloc(ncells_output_grid*sizeof(double)); -#pragma acc parallel loop present(p_fieldout_data[:ncells_output_grid], \ - out_area[:ncells_output_grid], \ - out_miss[:ncells_output_grid]) +#pragma acc enter data create(p_fieldout_data[:ncells_output_grid]) + +#pragma acc parallel loop present(p_fieldout_data[:ncells_output_grid]) deviceptr(out_area, \ + out_miss) for(int i=0; i 0) gsum_out += p_fieldout_data[i]; } } if ( cell_methods == CELL_METHODS_SUM ) { -#pragma acc parallel loop present(out_area[:ncells_output_grid], \ - out_miss[:ncells_output_grid], \ - p_fieldout_data[:ncells_output_grid]) +#pragma acc parallel loop present(p_fieldout_data[:ncells_output_grid]) deviceptr(out_miss) + for(int i=0; i 0) { p_fieldout_data[i] /= out_area[i]; @@ -553,7 +562,7 @@ void do_scalar_conserve_interp_gpu(Interp_config_gpu *interp_gpu, int varid, int } if( (target_grid) ) { -#pragma acc parallel loop present(out_area[:ncells_output_grid]) +#pragma acc parallel loop deviceptr(out_area) for(int i=0; inxcells; #pragma acc parallel loop present(minterp_gpu->output_parent_cell_index[:ixcells], \ minterp_gpu->input_parent_cell_index[:ixcells], \ - minterp_gpu->xcell_area[:ixcells], \ - out_area[:ncells_output_grid])\ + minterp_gpu->xcell_area[:ixcells])\ copyin(p_fieldin_area[:ncells_input_grid],\ - p_gridin_area[:ncells_input_grid]) + p_gridin_area[:ncells_input_grid])\ + deviceptr(out_area) for(int ix=0; ixoutput_parent_cell_index[ix]; int ij1 = minterp_gpu->input_parent_cell_index[ix]; @@ -575,8 +584,9 @@ void do_scalar_conserve_interp_gpu(Interp_config_gpu *interp_gpu, int varid, int if(cell_measures ) out_area[ij2] += (area*p_fieldin_area[ij1]/p_gridin_area[ij1]); else out_area[ij2] += area; } -#pragma acc parallel loop present(p_fieldout_data[:ncells_output_grid], out_area[:ncells_output_grid], \ - output_grid[otile].cell_area[:ncells_output_grid]) +#pragma acc parallel loop present(p_fieldout_data[:ncells_output_grid], \ + output_grid[otile].cell_area[:ncells_output_grid])\ + deviceptr(out_area) for(int i=0; iweight; if(cell_methods == CELL_METHODS_SUM) { -#pragma acc parallel loop present(input_area_weight[:ncells_input_grid]) copyin(p_gridin_area[:ncells_input_grid]) +#pragma acc parallel loop deviceptr(input_area_weight) copyin(p_gridin_area[:ncells_input_grid]) for(int i=0 ; iinput_parent_cell_index[:nxcells], \ minterp_gpu->output_parent_cell_index[:nxcells], \ minterp_gpu->xcell_area[:nxcells], \ - input_area_weight[:ncells_input_grid], \ - fieldout_data[:ncells_output_grid], \ - out_area[:ncells_output_grid], \ - out_miss[:ncells_output_grid]) \ + fieldout_data[:ncells_output_grid]) \ copyin(fieldin_data[:ncells_input_grid]) -#pragma acc parallel loop +#pragma acc parallel loop deviceptr(input_area_weight, \ + out_area, \ + out_miss) for(int ix=0; ixinput_parent_cell_index[ix]; int ij2 = minterp_gpu->output_parent_cell_index[ix]; @@ -725,14 +733,13 @@ void interp_data_order2( const Grid_config *output_grid, const Grid_config *inpu minterp_gpu->input_parent_cell_index[:nxcells], \ minterp_gpu->output_parent_cell_index[:nxcells], \ minterp_gpu->xcell_area[:nxcells], \ - input_area_weight[:ncells_input_grid], \ - fieldout_data[:ncells_output_grid], \ - out_area[:ncells_output_grid], \ - out_miss[:ncells_output_grid]) \ + fieldout_data[:ncells_output_grid]) \ copyin(fieldin_data[:input_data_ncells], \ grad_mask[:ncells_input_grid], \ grad_x[:ncells_input_grid], grad_y[:ncells_input_grid]) -#pragma acc parallel loop +#pragma acc parallel loop deviceptr(input_area_weight, \ + out_area, \ + out_miss) for(int ix=0; ixinput_parent_cell_index[ix]; int ij2 = minterp_gpu->output_parent_cell_index[ix]; diff --git a/tools/fregrid_gpu/interp_utils_gpu.c b/tools/fregrid_gpu/interp_utils_gpu.c index 54015ece..1c0d45aa 100644 --- a/tools/fregrid_gpu/interp_utils_gpu.c +++ b/tools/fregrid_gpu/interp_utils_gpu.c @@ -78,15 +78,13 @@ void get_input_grid_mask_gpu(const int mask_size, double **input_grid_mask) { double *p_input_grid_mask; - - *input_grid_mask = (double *)malloc(mask_size*sizeof(double)); + *input_grid_mask = (double*) acc_malloc(mask_size*sizeof(double)); p_input_grid_mask = *input_grid_mask; - -#pragma acc enter data create(p_input_grid_mask[:mask_size]) -#pragma acc parallel loop independent present(p_input_grid_mask[:mask_size]) +#pragma acc parallel loop independent deviceptr(p_input_grid_mask) for( int i=0 ; ilon_min, \ + output_grid_cells->lon_max, \ + output_grid_cells->lat_min, \ + output_grid_cells->lat_max, \ + output_grid_cells->lon_cent, \ + output_grid_cells->area, \ + output_grid_cells->nvertices, \ + output_grid_cells->lon_vertices, \ + output_grid_cells->lat_vertices, \ + skip_input_cells, \ + approx_xcells_per_ij1, \ + ij2_start, \ + ij2_end) + for( int ij1=ij1_start; ij1 MASK_THRESH ) { int i_approx_xcells_per_ij1=0; @@ -157,32 +165,36 @@ int create_xgrid_2dx2d_order1_gpu(const int nlon_input_cells, const int nlat_in double xcell_dclon=-99.99, xcell_dclat=-99.99; - int *parent_input_index = NULL ; parent_input_index = (int *)malloc(upbound_nxcells*sizeof(int)); - int *parent_output_index = NULL ; parent_output_index = (int *)malloc(upbound_nxcells*sizeof(int)); - int *nxcells_per_ij1 = NULL ; nxcells_per_ij1 = (int *)malloc(input_grid_ncells*sizeof(int)); - double *store_xcell_area = NULL ; store_xcell_area = (double *)malloc(upbound_nxcells*sizeof(double)); + int *parent_input_index = NULL; parent_input_index = (int*) acc_malloc(upbound_nxcells*sizeof(int)); + int *parent_output_index = NULL; parent_output_index = (int*) acc_malloc(upbound_nxcells*sizeof(int)); + int *nxcells_per_ij1 = NULL; nxcells_per_ij1 = (int*) acc_malloc(input_grid_ncells*sizeof(int)); + double *store_xcell_area = NULL; store_xcell_area = (double*) acc_malloc(upbound_nxcells*sizeof(double)); -#pragma acc enter data create(parent_input_index[:upbound_nxcells], \ - parent_output_index[:upbound_nxcells], \ - store_xcell_area[:upbound_nxcells], \ - nxcells_per_ij1[:input_grid_ncells]) #pragma acc data present(output_grid_lon[:output_grid_npts], \ output_grid_lat[:output_grid_npts], \ input_grid_lon[:input_grid_npts], \ - input_grid_lat[:input_grid_npts], \ - output_grid_cells[:1], \ - approx_nxcells_per_ij1[:input_grid_ncells], \ - ij2_start[:input_grid_ncells], \ - ij2_end[:input_grid_ncells], \ - mask_input_grid[:input_grid_ncells], \ - nxcells_per_ij1[:input_grid_ncells], \ - parent_input_index[:upbound_nxcells], \ - parent_output_index[:upbound_nxcells], \ - store_xcell_area[:upbound_nxcells]) \ + input_grid_lat[:input_grid_npts]) \ copyin(input_grid_ncells, output_grid_ncells) \ copy(nxcells) -#pragma acc parallel loop reduction(+:nxcells) +#pragma acc parallel loop reduction(+:nxcells) \ + deviceptr(nxcells_per_ij1, \ + store_xcell_area, \ + parent_input_index, \ + parent_output_index, \ + output_grid_cells->lon_min, \ + output_grid_cells->lon_max, \ + output_grid_cells->lat_min, \ + output_grid_cells->lat_max, \ + output_grid_cells->lon_cent, \ + output_grid_cells->area, \ + output_grid_cells->nvertices, \ + output_grid_cells->lon_vertices, \ + output_grid_cells->lat_vertices, \ + approx_nxcells_per_ij1, \ + mask_input_grid, \ + ij2_start, \ + ij2_end) for(int ij1=ij1_start; ij1 MASK_THRESH) { @@ -261,15 +273,11 @@ int create_xgrid_2dx2d_order1_gpu(const int nlon_input_cells, const int nlat_in approx_nxcells_per_ij1, parent_input_index, parent_output_index, store_xcell_area, interp_for_itile); -#pragma acc exit data delete( parent_input_index[:upbound_nxcells], \ - parent_output_index[:upbound_nxcells], \ - store_xcell_area[:upbound_nxcells], \ - nxcells_per_ij1[:input_grid_ncells]) + acc_free(parent_input_index) ; parent_input_index = NULL; + acc_free(parent_output_index); parent_output_index = NULL; + acc_free(nxcells_per_ij1) ; nxcells_per_ij1 = NULL; + acc_free(store_xcell_area) ; store_xcell_area = NULL; - free(parent_input_index) ; parent_input_index = NULL; - free(parent_output_index); parent_output_index = NULL; - free(nxcells_per_ij1) ; nxcells_per_ij1 = NULL; - free(store_xcell_area) ; store_xcell_area = NULL; return nxcells; @@ -303,44 +311,45 @@ int create_xgrid_2dx2d_order2_gpu(const int nlon_input_cells, const int nlat_in int ij1_start = jlat_overlap_starts*nlon_input_cells; int ij1_end = (jlat_overlap_ends+1)*nlon_input_cells; - int *parent_input_index=NULL ; parent_input_index = (int *)malloc(upbound_nxcells*sizeof(int)); - int *parent_output_index=NULL ; parent_output_index = (int *)malloc(upbound_nxcells*sizeof(int)); - double *store_xcell_area=NULL ; store_xcell_area = (double *)malloc(upbound_nxcells*sizeof(double)); - double *store_xcell_dclon=NULL ; store_xcell_dclon = (double *)malloc(upbound_nxcells*sizeof(double)); - double *store_xcell_dclat=NULL ; store_xcell_dclat = (double *)malloc(upbound_nxcells*sizeof(double)); - - int *nxcells_per_ij1=NULL ; nxcells_per_ij1 = (int *)malloc(input_grid_ncells*sizeof(int)); - double *summed_input_area=NULL; summed_input_area = (double *)malloc(input_grid_ncells*sizeof(double)); - double *summed_input_clat=NULL; summed_input_clat = (double *)malloc(input_grid_ncells*sizeof(double)); - double *summed_input_clon=NULL; summed_input_clon = (double *)malloc(input_grid_ncells*sizeof(double)); - -#pragma acc enter data create(parent_input_index[:upbound_nxcells], \ - parent_output_index[:upbound_nxcells], \ - store_xcell_area[:upbound_nxcells], \ - nxcells_per_ij1[:input_grid_ncells], \ - store_xcell_dclon[:upbound_nxcells], \ - store_xcell_dclat[:upbound_nxcells], \ - summed_input_area[:input_grid_ncells], \ - summed_input_clon[:input_grid_ncells], \ - summed_input_clat[:input_grid_ncells]) + int *parent_input_index=NULL ; parent_input_index = (int *) acc_malloc(upbound_nxcells*sizeof(int)); + int *parent_output_index=NULL ; parent_output_index = (int *)acc_malloc(upbound_nxcells*sizeof(int)); + double *store_xcell_area=NULL ; store_xcell_area = (double *) acc_malloc(upbound_nxcells*sizeof(double)); + double *store_xcell_dclon=NULL ; store_xcell_dclon = (double *) acc_malloc(upbound_nxcells*sizeof(double)); + double *store_xcell_dclat=NULL ; store_xcell_dclat = (double *) acc_malloc(upbound_nxcells*sizeof(double)); + + int *nxcells_per_ij1=NULL ; nxcells_per_ij1 = (int *) acc_malloc(input_grid_ncells*sizeof(int)); + double *summed_input_area=NULL; summed_input_area = (double *) acc_malloc(input_grid_ncells*sizeof(double)); + double *summed_input_clat=NULL; summed_input_clat = (double *) acc_malloc(input_grid_ncells*sizeof(double)); + double *summed_input_clon=NULL; summed_input_clon = (double *) acc_malloc(input_grid_ncells*sizeof(double)); + #pragma acc data present(output_grid_lon[:output_grid_npts], \ output_grid_lat[:output_grid_npts], \ input_grid_lon[:input_grid_npts], \ - input_grid_lat[:input_grid_npts], \ - output_grid_cells[:1], \ - approx_nxcells_per_ij1[:input_grid_ncells], \ - ij2_start[:input_grid_ncells], \ - ij2_end[:input_grid_ncells], \ - mask_input_grid[:input_grid_ncells], \ - nxcells_per_ij1[:input_grid_ncells], \ - parent_input_index[:upbound_nxcells], \ - parent_output_index[:upbound_nxcells], \ - store_xcell_area[:upbound_nxcells], \ - store_xcell_dclon[:upbound_nxcells], \ - store_xcell_dclat[:upbound_nxcells]) \ + input_grid_lat[:input_grid_npts]) \ copyin(input_grid_ncells, output_grid_ncells) copy(nxcells) -#pragma acc parallel loop reduction(+:nxcells) +#pragma acc parallel loop reduction(+:nxcells) \ + deviceptr(parent_input_index, \ + parent_output_index, \ + store_xcell_area, \ + store_xcell_dclon, \ + store_xcell_dclat, \ + summed_input_area, \ + summed_input_clon, \ + summed_input_clat, \ + nxcells_per_ij1, \ + output_grid_cells->lon_min, \ + output_grid_cells->lon_max, \ + output_grid_cells->lat_min, \ + output_grid_cells->lon_cent, \ + output_grid_cells->area, \ + output_grid_cells->nvertices, \ + output_grid_cells->lon_vertices, \ + output_grid_cells->lat_vertices, \ + approx_nxcells_per_ij1, \ + mask_input_grid, \ + ij2_start, \ + ij2_end) for(int ij1=ij1_start; ij1 MASK_THRESH) { @@ -366,7 +375,8 @@ int create_xgrid_2dx2d_order2_gpu(const int nlon_input_cells, const int nlat_in #pragma acc loop seq reduction(+:ixcell) \ reduction(+:summed_input_area_ij1) \ reduction(+:summed_input_clon_ij1) \ - reduction(+:summed_input_clat_ij1) + reduction(+:summed_input_clat_ij1) \ + for(int ij2=ij2_start[ij1]; ij2<=ij2_end[ij1]; ij2++) { int nvertices2, xvertices=1; @@ -439,11 +449,10 @@ int create_xgrid_2dx2d_order2_gpu(const int nlon_input_cells, const int nlat_in interp_for_itile->dcentroid_lat[:nxcells], \ input_grid_lon[:input_grid_ncells], \ input_grid_lat[:input_grid_ncells], \ - summed_input_area[:input_grid_ncells], \ - summed_input_clon[:input_grid_ncells], \ - summed_input_clat[:input_grid_ncells], \ interp_for_itile->input_parent_cell_index[:nxcells]) \ - copyin(readin_input_area[:input_grid_ncells]) + copyin(readin_input_area[:input_grid_ncells]) deviceptr(summed_input_area, \ + summed_input_clon, \ + summed_input_clat) for(int ix=0 ; ixinput_parent_cell_index[ix]; double input_area = summed_input_area[ij1]; @@ -463,25 +472,15 @@ int create_xgrid_2dx2d_order2_gpu(const int nlon_input_cells, const int nlat_in interp_for_itile->dcentroid_lat[ix] -= input_clat/input_area; } -#pragma acc exit data delete( parent_input_index[:upbound_nxcells], \ - parent_output_index[:upbound_nxcells], \ - store_xcell_area[:upbound_nxcells], \ - nxcells_per_ij1[:input_grid_ncells], \ - store_xcell_dclon[:upbound_nxcells], \ - store_xcell_dclat[:upbound_nxcells], \ - summed_input_area[:input_grid_ncells], \ - summed_input_clon[:input_grid_ncells], \ - summed_input_clat[:input_grid_ncells]) - - free(parent_input_index) ; parent_input_index = NULL; - free(parent_output_index) ; parent_output_index = NULL; - free(store_xcell_area) ; store_xcell_area = NULL; - free(nxcells_per_ij1) ; nxcells_per_ij1 = NULL; - free(store_xcell_dclon) ; store_xcell_dclon = NULL; - free(store_xcell_dclat) ; store_xcell_dclat = NULL; - free(summed_input_area); summed_input_area = NULL; - free(summed_input_clon); summed_input_clon = NULL; - free(summed_input_clat); summed_input_clat = NULL; + acc_free(parent_input_index); parent_input_index = NULL; + acc_free(parent_output_index); parent_output_index = NULL; + acc_free(store_xcell_area); store_xcell_area = NULL; + acc_free(nxcells_per_ij1); nxcells_per_ij1 = NULL; + acc_free(store_xcell_dclon); store_xcell_dclon = NULL; + acc_free(store_xcell_dclat); store_xcell_dclat = NULL; + acc_free(summed_input_area); summed_input_area = NULL; + acc_free(summed_input_clon); summed_input_clon = NULL; + acc_free(summed_input_clat); summed_input_clat = NULL; return nxcells; diff --git a/tools/libfrencutils_gpu/create_xgrid_utils_gpu.c b/tools/libfrencutils_gpu/create_xgrid_utils_gpu.c index 706c4cb9..291d72b7 100644 --- a/tools/libfrencutils_gpu/create_xgrid_utils_gpu.c +++ b/tools/libfrencutils_gpu/create_xgrid_utils_gpu.c @@ -653,26 +653,26 @@ void get_grid_cell_struct_gpu( const int nlon, const int nlat, const Grid_config int ncells=nlon*nlat; int npts=(nlon+1)*(nlat+1); - grid_cells->lon_min = (double *)malloc(ncells*sizeof(double)); - grid_cells->lon_max = (double *)malloc(ncells*sizeof(double)); - grid_cells->lat_min = (double *)malloc(ncells*sizeof(double)); - grid_cells->lat_max = (double *)malloc(ncells*sizeof(double)); - grid_cells->lon_cent = (double *)malloc(ncells*sizeof(double)); - grid_cells->area = (double *)malloc(ncells*sizeof(double)); - grid_cells->nvertices = (int *)malloc(ncells*sizeof(int)); - grid_cells->lon_vertices = (double *)malloc(MAX_V*ncells*sizeof(double)); - grid_cells->lat_vertices = (double *)malloc(MAX_V*ncells*sizeof(double)); - -#pragma acc enter data create(grid_cells[:1]) -#pragma acc enter data create(grid_cells->lon_min[:ncells], grid_cells->lon_max[:ncells], \ - grid_cells->lat_min[:ncells], grid_cells->lat_max[:ncells], \ - grid_cells->lon_cent[:ncells], grid_cells->nvertices[:ncells],\ - grid_cells->area[:ncells]) -#pragma acc enter data create(grid_cells->lon_vertices[:MAX_V*ncells], \ - grid_cells->lat_vertices[:MAX_V*ncells]) - -#pragma acc data present(grid_cells[:1], lon[:npts], lat[:npts]) -#pragma acc parallel loop independent + grid_cells->lon_min = (double *) acc_malloc(ncells*sizeof(double)); + grid_cells->lon_max = (double *) acc_malloc(ncells*sizeof(double)); + grid_cells->lat_min = (double *) acc_malloc(ncells*sizeof(double)); + grid_cells->lat_max = (double *) acc_malloc(ncells*sizeof(double)); + grid_cells->lon_cent = (double *) acc_malloc(ncells*sizeof(double)); + grid_cells->area = (double *) acc_malloc(ncells*sizeof(double)); + grid_cells->nvertices = (int *) acc_malloc(ncells*sizeof(int)); + grid_cells->lon_vertices = (double *) acc_malloc(MAX_V*ncells*sizeof(double)); + grid_cells->lat_vertices = (double *) acc_malloc(MAX_V*ncells*sizeof(double)); + +#pragma acc data present(lon[:npts], lat[:npts]) +#pragma acc parallel loop independent deviceptr(grid_cells->lon_min, \ + grid_cells->lon_max, \ + grid_cells->lat_min, \ + grid_cells->lat_max, \ + grid_cells->lon_cent, \ + grid_cells->area, \ + grid_cells->nvertices, \ + grid_cells->lon_vertices, \ + grid_cells->lat_vertices) for(int icell=0; icelllon_vertices, \ - grid_cells->lat_vertices, \ - grid_cells->lon_min, \ - grid_cells->lon_max, \ - grid_cells->lon_cent, \ - grid_cells->lat_max, \ - grid_cells->lat_min, \ - grid_cells->nvertices, \ - grid_cells->area) -#pragma acc exit data delete(grid_cells) - - free(grid_cells->lon_min); grid_cells->lon_min = NULL; - free(grid_cells->lon_max); grid_cells->lon_max = NULL; - free(grid_cells->lon_cent); grid_cells->lon_cent = NULL; - free(grid_cells->lat_min); grid_cells->lat_min = NULL; - free(grid_cells->lat_max); grid_cells->lat_max = NULL; - free(grid_cells->area); grid_cells->area = NULL; - free(grid_cells->nvertices); grid_cells->nvertices=NULL; - free(grid_cells->lon_vertices); grid_cells->lon_vertices = NULL; - free(grid_cells->lat_vertices); grid_cells->lat_vertices = NULL; -} - - void get_cell_vertices_gpu( const int icell, const int nlon, const double *lon, const double *lat, double *x, double *y ) { @@ -746,48 +720,6 @@ void get_cell_vertices_gpu( const int icell, const int nlon, const double *lon, } -void create_upbound_nxcells_arrays_on_device_gpu(const int n, int **approx_nxcells_per_ij1, - int **ij2_start, int **ij2_end) -{ - - int *p_approx_nxcells_per_ij1; - int *p_ij2_start; - int *p_ij2_end; - - *approx_nxcells_per_ij1 = (int *)malloc(n*sizeof(int)); - *ij2_start = (int *)malloc(n*sizeof(int)); - *ij2_end = (int *)malloc(n*sizeof(int)); - - p_approx_nxcells_per_ij1 = *approx_nxcells_per_ij1; - p_ij2_start = *ij2_start; - p_ij2_end = *ij2_end; - -#pragma acc enter data copyin(p_approx_nxcells_per_ij1[:n], \ - p_ij2_start[:n], \ - p_ij2_end[:n]) - -} - -void free_upbound_nxcells_arrays_gpu( const int n, int **approx_nxcells_per_ij1, - int **ij2_start, int **ij2_end) -{ - int *p_approx_nxcells_per_ij1; - int *p_ij2_start ; - int *p_ij2_end; - - p_approx_nxcells_per_ij1 = *approx_nxcells_per_ij1; - p_ij2_start = *ij2_start; - p_ij2_end = *ij2_end; - -#pragma acc exit data delete(p_approx_nxcells_per_ij1[:n], \ - p_ij2_start[:n], \ - p_ij2_end[:n]) - - free(*approx_nxcells_per_ij1); *approx_nxcells_per_ij1 = NULL; - free(*ij2_start) ; *ij2_start = NULL; - free(*ij2_end) ; *ij2_end = NULL; -} - void copy_data_to_interp_on_device_gpu(const int nxcells, const int input_ncells, const int upbound_nxcells, int *xcells_per_ij1, double *xcell_dclon, double *xcell_dclat, int *approx_xcells_per_ij1, int *parent_input_index, int *parent_output_index, @@ -813,11 +745,12 @@ void copy_data_to_interp_on_device_gpu(const int nxcells, const int input_ncells #pragma acc enter data if(copy_xcentroid) create(interp_for_input_tile->dcentroid_lon[:nxcells], \ interp_for_input_tile->dcentroid_lat[:nxcells]) -#pragma acc data present(xcells_per_ij1[:input_ncells], approx_xcells_per_ij1[:input_ncells], \ - parent_input_index[:upbound_nxcells], \ - parent_output_index[:upbound_nxcells], \ - xcell_areas[:upbound_nxcells], interp_for_input_tile[:1]) -#pragma acc parallel loop independent async(0) +#pragma acc data present(interp_for_input_tile[:1]) +#pragma acc parallel loop independent async(0) deviceptr(xcells_per_ij1, \ + approx_xcells_per_ij1, \ + parent_input_index, \ + parent_output_index, \ + xcell_areas) for(int ij1=0 ; ij1