12.8.16

MPI_Bcast large dynamic char array

SysAdmins hate it when all your MPI procs access their precious disks.  The preferred procedure is to read from the disk once, using a single proc, then pass the data to all of the other procs.  In principle MPI_Bcast makes this easy.  In practice...you decide.

I needed to do this for files of arbitrary size (a gigabyte or more) containing string data.  The process is conceptually simple:

1. Use the root process (proc 0) to get the file size.
2. MPI_Bcast this file size to all procs.
3. Initialize and size a char array to contain the file data on all procs.
4. Use proc 0 to read the data file.
5. MPI_Bcast the file data to all procs.

Here is a prototype.
Save:  testmpi.cpp
Compile:  mpic++ -o t testmpi.cpp
Run:  mpirun -np 8 t yourbigfile.txt
-------
#include <algorithm>
#include <fstream>
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

using namespace std;

//Reports how many bytes the named file holds.
//filename: path of the file to measure.
//Returns the get position observed right after opening the file at its end;
//when the file cannot be opened the stream is in a failed state and tellg()
//yields pos_type(-1).
std::ifstream::pos_type filesize(const char* filename)
{
    //binary | ate: open without text translation and start positioned at end-of-file
    const std::ios_base::openmode atEndBinary = std::ios_base::binary | std::ios_base::ate;

    std::ifstream probe;
    probe.open(filename, atEndBinary);

    const std::ifstream::pos_type endPosition = probe.tellg();
    return endPosition;
}

//Reads an entire file into a freshly malloc'd memory buffer.
//DatFilePath: path of the file to read.
//Returns a buffer holding the file's bytes followed by one '\0' terminator,
//so the result can be handed to C string functions. The caller owns the
//buffer and must release it with free().
//On failure a message is written to stderr and the process exits:
//  exit(1) - the file cannot be opened or its size cannot be determined
//  exit(2) - the buffer cannot be allocated
//  exit(3) - fewer bytes were read than the file size reported
char * MyBigRead(char* DatFilePath)
{
    //binary mode: the offset reported by ftell must equal the byte count fread
    //delivers, which text mode does not guarantee on every platform
    FILE * pFile = fopen ( DatFilePath , "rb" );
    if (pFile==NULL) {fputs ("File error",stderr); exit (1);}

    // obtain file size; ftell signals failure with -1, which must be caught
    // before the value is used as an (unsigned) allocation size
    if (fseek (pFile , 0 , SEEK_END) != 0) {fputs ("File error",stderr); exit (1);}
    const long lEnd = ftell (pFile);
    if (lEnd < 0) {fputs ("File error",stderr); exit (1);}
    rewind (pFile);
    const size_t lSize = (size_t) lEnd;

    // allocate memory to contain the whole file plus the terminator;
    // the extra byte also keeps the request non-zero for an empty file
    char * buffer = (char*) malloc (lSize + 1);
    if (buffer == NULL) {fputs ("Memory error",stderr); exit (2);}

    // copy the file into the buffer:
    const size_t result = fread (buffer,1,lSize,pFile);
    if (result != lSize) {fputs ("Reading error",stderr); exit (3);}
    buffer[lSize] = '\0';

    fclose (pFile);
    return buffer;
}

int main(int argc, char* argv[]) {

MPI::Init ();
int procid = MPI::COMM_WORLD.Get_rank ( );  //Get the individual process ID.
int nprocs = MPI::COMM_WORLD.Get_size ( );

unsigned long long f = 0; //dat file size

//get the size of the dat file using proc 0
if (procid == 0)
{
f = (unsigned long long)filesize(argv[1]); 
f = f + 1; //increase by 1 to accomodate \0 string terminator
}

//broadcast file size to all procs
MPI_Bcast(&f, 1, MPI_UNSIGNED_LONG_LONG, 0, MPI_COMM_WORLD);

//initialize and size the data structure to hold contents of dat file
char * DatFileBuffer = (char*)malloc(f);

//report before MPI_Bcast'ing the data
printf("[proc%d]Before: snippet of DatFileBuffer:>%.5s<, size:%d\n", procid, DatFileBuffer, f);

//read the dat file from disk using proc 0
if (procid == 0)
{
char * d = MyBigRead(argv[1]);

//convert the char* to a char array
strcpy(DatFileBuffer, d); //copy data read into char * d to the pre-sized DatFileBuffer
}

//broadcast the dat file contents to all procs
MPI_Bcast(&DatFileBuffer[0], f, MPI_CHAR, 0, MPI_COMM_WORLD);

//report after MPI_Bcast'ing the data
printf("[proc%d]After: snippet of DatFileBuffer:>%.10s<, size:%d\n", procid, DatFileBuffer, f);

MPI_Finalize();
return 0;
}
-------