Dear MRtrix gurus,
I am relatively new to the MRtrix toolset, and am mostly interested in whole-brain tractography. I really that tckgen
outputs the streamlines themselves in a format that is very easy to parse (minus the hassle of the endianness).
However, a tractography will typically consider millions of streamlines, resulting in relatively large .tck
files – commonly over several GB. Writing scripts to process such large amounts of data anywhere else than on a cluster will require some planning, in order to avoid filling up the virtual memory and crashing the host system. Carelessly written scripts run on clusters will consume unnecessarily large amounts of memory at the expense of everyone else.
One piece of information that I found myself missing was an easy way to map the byte-offsets and vertex-counts of streamlines within a .tck
file. This does not directly solve the issue of working with large .tck
files, but makes it easier to load arbitrary subsets of streamlines into memory. Of course, one still has to be clever about how to compute and combine intermediary results, but this will always be the case.
I put together a demo-code in C++ which does this mapping. It’s not particularly clean or heavily optimised, but it runs fairly fast on my laptop (~0.93 sec on average to parse 200k streamlines). I thought I would put it out there so people can use it, and potentially improve it. I’d really appreciate feedback if you find it and/or the idea useful
Cheers!
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <cctype>
#include <cmath>
#include <cstdint>
#include <utility>
#include <stdexcept>
#include <exception>
#include <string>
#include <unordered_map>
#include <vector>
// maximum length of a metadata line
#define META_MAXLEN 2047
// type used to store metadata field+value pairs
using meta_t = std::unordered_map< std::string, std::string >;
// ------------------------------------------------------------------------
// remove whitespace from both ends of a string
std::string strtrim( const char *str ) {
std::string out;
// null input
if ( str == NULL ) return out;
// remove leading whitespace
while (isspace( (unsigned char) *str )) ++str;
// all whitespace?
if ( *str == 0 ) return out;
// trim trailing whitespace
const char *end = str + strlen(str)-1;
while (isspace( (unsigned char) *end )) --end;
// copy resulting string
return out.assign( str, end-str+1 );
}
// insert new metadata field into map
void parse_field( FILE *pFile, char *buf, meta_t &metadata ) {
// read line
if ( fgets( buf, META_MAXLEN, pFile ) == NULL )
throw std::runtime_error("Could not read meta line");
// split name and value
char *tok = strtok( buf, ":" );
if ( tok == NULL ) throw std::runtime_error("Failed to read metadata");
metadata.emplace( strtrim(tok), strtrim(strtok( NULL, "\n" )) );
}
// parse number at the end of field "file"
long int parse_offset( const std::string& val ) {
const char* num = val.c_str() + val.length()-1;
while (isdigit( (unsigned char) *num )) --num;
return atoi(num+1);
}
// test whether input string starts with given prefix
bool startswith( const std::string& str, const char *pfx ) {
if ( pfx == NULL ) throw std::invalid_argument("Null prefix");
int n = strlen(pfx), L = str.length();
const char *x = str.c_str();
if ( L < n ) return false;
while( n > 0 && *x == *pfx ) ++pfx, ++x, --n;
return n == 0;
}
// test whether input string ends with given suffix
bool endswith( const std::string& str, const char *sfx ) {
if ( sfx == NULL ) throw std::invalid_argument("Null suffix");
int n = strlen(sfx), L = str.length();
const char *x = str.c_str() + L-n;
if ( L < n ) return false;
while( n > 0 && *x == *sfx ) ++sfx, ++x, --n;
return n == 0;
}
// ------------------------------------------------------------------------
// see: https://stackoverflow.com/a/1001328/472610
bool is_big_endian()
{
uint16_t x = 1;
return *(uint8_t*) &x != 1;
}
// see: https://stackoverflow.com/a/105342/472610
void swap_byte_order( uint32_t& x )
{
x = (x >> 24) |
((x<<8) & 0x00FF0000) |
((x>>8) & 0x0000FF00) |
(x << 24);
}
// ------------------------------------------------------------------------
struct Streamline {
uint32_t pos;
uint16_t len;
};
void usage( const char *pname ) {
printf("\nUSAGE:\n");
printf(" %s <InputFile.tck> <OutputFile.bin>\n\n", pname);
printf("Parse input .tck file, and write byte-offset and vertex-count of streamlines to output binary file.\n");
printf("The output file is formatted as successive pairs of 32 and 16 bits integers.\n");
printf("Hence, one can find the offset and count of the K^th streamline with:\n");
printf(" fseek( bin_file, K * 6, SEEK_SET );\n");
printf(" fread( &streamline, 6, 1, bin_file );\n\n");
printf("Where 'streamline' is of type: struct Streamline { uint32_t offset; uint16_t count; };\n\n");
}
int main( int argc, char *argv[] )
{
if ( argc != 3 ) {
usage( argv[0] );
throw std::invalid_argument( "Not enough inputs." );
}
char metabuf[META_MAXLEN];
std::vector<Streamline> streamlines;
meta_t metadata;
bool doswap;
long int offset;
uint32_t vertex[3];
// open files
FILE *Ftck = fopen( argv[1], "rb" );
FILE *Fout = fopen( argv[2], "wb+" );
if ( Ftck == NULL ) throw std::runtime_error( "Could not open .tck file." );
if ( Fout == NULL ) {
fclose(Ftck);
throw std::runtime_error( "Could not open or create output file." );
}
try {
// read "mrtrix tracks"
if ( fgets( metabuf, META_MAXLEN, Ftck ) == NULL )
throw std::runtime_error("Could not read .tck file.");
if ( strtrim(metabuf) != "mrtrix tracks" )
throw std::runtime_error("Failed to identify magic string.");
// parse metadata
while ( metadata.find("END") == metadata.end() )
parse_field( Ftck, metabuf, metadata );
// check fields of interest
offset = parse_offset( metadata["file"] ); // initial offset
if ( ! startswith(metadata["datatype"],"Float32") )
throw std::runtime_error("This implementation only works with Float32 types.");
// determine whether or not byteswap is needed
if ( is_big_endian() )
doswap = endswith( metadata["datatype"], "LE" );
else
doswap = endswith( metadata["datatype"], "BE" );
// prepare iteration
fseek( Ftck, offset, SEEK_SET ); // move to initial offset
streamlines.reserve( atoi(metadata["count"].c_str()) ); // allocate storage for streamlines
// iterate over streamlines
Streamline cur;
cur.pos = offset;
cur.len = 0;
while ( true ) {
if ( fread( vertex, 4, 3, Ftck ) != 3 )
throw std::runtime_error( "Number of vertices is not a multiple of 3." );
if ( doswap ) swap_byte_order( *vertex );
// check for end of file or end of streamline
if ( std::isinf( *(float*) vertex ) ) break;
if ( std::isnan( *(float*) vertex ) ) {
streamlines.push_back(cur);
// fprintf( Fout, "%u %hu\n", cur.pos, cur.len );
fwrite( (void*) &cur, 6, 1, Fout );
cur.pos = ftell( Ftck );
cur.len = 0;
}
else ++cur.len;
}
}
catch ( std::exception& e ) {
fprintf( stderr, "AN ERROR OCCURRED:\n\n" );
fprintf( stderr, "%s\n", e.what() );
}
fclose(Ftck);
fclose(Fout);
// print summary
printf( "Found %lu streamlines.\n", streamlines.size() );
// potentially do something with streamlines vector?
}
To be saved in a file tck-offsets.cpp
and compiled e.g. with:
clang++ tck-offsets.cpp -std=c++11 -o tck-offfsets
./tck-offsets Tracks.tck output.bin
NOTE: one potential limitation of this implementation is that it only checks of Inf
or NaN
values (to determine end of file/streamline) for the first coordinate of each vertex. This can easily be improved to check all three coordinates if needed.