SysAdmins hate it when all your MPI procs access their precious disks. The preferred procedure is to read from the disk once, using a single proc, then pass the data to all of the other procs. In principle MPI_Bcast makes this easy. In practice...you decide.
I needed to do this for files of arbitrary size (a gigabyte or more) containing string data. The process is conceptually simple:
1. Use the root process (proc 0) to get the file size.
2. MPI_Bcast this file size to all procs.
3. Initialize and size a char array to contain the file data on all procs.
4. Use proc 0 to read the data file.
5. MPI_Bcast the file data to all procs.
Here is a prototype.
Save as: testmpi.cpp
Compile: mpic++ -o t testmpi.cpp
Run: mpirun -np 8 t yourbigfile.txt
-------
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <algorithm>
#include <fstream>
#include <mpi.h>
using namespace std;
// Reports the size of a file in bytes.
// The stream is opened in binary mode and positioned at the end, so the
// current read position equals the byte count. If the file cannot be opened,
// tellg() on the failed stream yields pos_type(-1).
std::ifstream::pos_type filesize(const char* filename)
{
    std::ifstream stream;
    stream.open(filename, std::ios::binary | std::ios::ate);
    const std::ifstream::pos_type endPosition = stream.tellg();
    return endPosition;
}
// Reads an entire file into a freshly malloc'ed memory buffer.
//
// DatFilePath: path of the file to read.
// Returns:     a buffer holding the file's bytes followed by one extra '\0',
//              so the result can be used as a C string when the file has no
//              embedded NUL bytes. The caller owns the buffer and must
//              release it with free().
// On failure the function prints a message to stderr and terminates the
// process: exit code 1 = file error, 2 = memory error, 3 = reading error.
char * MyBigRead(char* DatFilePath)
{
    // Binary mode: in text mode some platforms translate line endings, so the
    // number of bytes fread() delivers would not match the size from ftell().
    FILE * pFile = fopen(DatFilePath, "rb");
    if (pFile == NULL) {fputs ("File error",stderr); exit (1);}

    // obtain file size; ftell() reports -1 on error, which must not be
    // converted into an (enormous) unsigned allocation size
    if (fseek(pFile, 0, SEEK_END) != 0) {
        fputs ("File error",stderr);
        fclose(pFile);
        exit (1);
    }
    const long endPos = ftell(pFile);
    if (endPos < 0) {
        fputs ("File error",stderr);
        fclose(pFile);
        exit (1);
    }
    rewind(pFile);
    const size_t lSize = static_cast<size_t>(endPos);

    // allocate memory to contain the whole file plus the terminator;
    // the +1 also avoids malloc(0), which may legitimately return NULL
    char * buffer = static_cast<char*>(malloc(lSize + 1));
    if (buffer == NULL) {
        fputs ("Memory error",stderr);
        fclose(pFile);
        exit (2);
    }

    // copy the file into the buffer:
    const size_t result = fread(buffer, 1, lSize, pFile);
    fclose(pFile);
    if (result != lSize) {
        fputs ("Reading error",stderr);
        free(buffer);
        exit (3);
    }

    buffer[lSize] = '\0';
    return buffer;
}
int main(int argc, char* argv[]) {
MPI::Init ();
int procid = MPI::COMM_WORLD.Get_rank ( ); //Get the individual process ID.
int nprocs = MPI::COMM_WORLD.Get_size ( );
unsigned long long f = 0; //dat file size
//get the size of the dat file using proc 0
if (procid == 0)
{
f = (unsigned long long)filesize(argv[1]);
f = f + 1; //increase by 1 to accomodate \0 string terminator
}
//broadcast file size to all procs
MPI_Bcast(&f, 1, MPI_UNSIGNED_LONG_LONG, 0, MPI_COMM_WORLD);
//initialize and size the data structure to hold contents of dat file
char * DatFileBuffer = (char*)malloc(f);
//report before MPI_Bcast'ing the data
printf("[proc%d]Before: snippet of DatFileBuffer:>%.5s<, size:%d\n", procid, DatFileBuffer, f);
//read the dat file from disk using proc 0
if (procid == 0)
{
char * d = MyBigRead(argv[1]);
//convert the char* to a char array
strcpy(DatFileBuffer, d); //copy data read into char * d to the pre-sized DatFileBuffer
}
//broadcast the dat file contents to all procs
MPI_Bcast(&DatFileBuffer[0], f, MPI_CHAR, 0, MPI_COMM_WORLD);
//report after MPI_Bcast'ing the data
printf("[proc%d]After: snippet of DatFileBuffer:>%.10s<, size:%d\n", procid, DatFileBuffer, f);
MPI_Finalize();
return 0;
}
-------
No comments:
Post a Comment