Tech Notes: May 2014

Row-Wise Read:

/**
 * Reads an RCFile one row at a time and prints every column value of every row
 * to standard output as {@code columnText=<value>}.
 *
 * <p>The number of columns is taken from the file metadata entry
 * {@code hive.io.rcfile.column.number}. An {@link IOException} raised while
 * reading is printed via {@code printStackTrace()} and ends the read; it is not
 * propagated to the caller.
 *
 * @param rcReader the RCFile reader to consume; it is not closed by this method
 */
private static void readRowWise(RCFile.Reader rcReader) {
    Text len = rcReader.getMetadata().get(new Text("hive.io.rcfile.column.number"));
    // parseInt yields the primitive directly; valueOf would box and then unbox.
    int numberOfColumns = Integer.parseInt(len.toString());

    // Both writables are allocated once and reused for every row rather than
    // being re-created on each iteration.
    // NOTE(review): RCFile.Reader.next is expected to treat its argument as an
    // output parameter (it overwrites the value) -- confirm against the Hive API.
    LongWritable rowsRead = new LongWritable(0);
    BytesRefArrayWritable cols = new BytesRefArrayWritable();

    try {
        while (rcReader.next(rowsRead)) {
            /*
             * 'resetValid' has to be called for every row so the container
             * exposes all columns of that row. It looks ugly, but this is what
             * makes row-wise reading work: the value passed in must be the
             * number of columns.
             */
            cols.resetValid(numberOfColumns);

            /*
             * Author's note: the name getCurrentRow is somewhat misleading. It
             * reads all rows of the current row group column by column (a
             * consequence of the RCFile layout) and keeps them internally, so
             * later calls return data from the same buffer. Only after
             * 'resetValid' has been called can the columns of the next row be
             * fetched through cols.get(i).
             */
            rcReader.getCurrentRow(cols);

            // size() is the number of columns available for the current row.
            int size = cols.size();
            for (int i = 0; i < size; i++) {
                BytesRefWritable currentColumn = cols.get(i);

                // Copy the bytes of this column for the current row.
                byte[] currentColumnBytes = currentColumn.getBytesCopy();
                Text text = new Text(currentColumnBytes);
                System.out.println("columnText=" + text.toString());
            }
        }
    } catch (IOException e) {
        // Best-effort demo code: report the failure and stop reading.
        e.printStackTrace();
    }
}

Column-Wise Read:

private static void readColumnWise(RCFile.Reader rcReader) {

Text len = rcReader.getMetadata().get(new Text("hive.io.rcfile.column.number"));

int numberOfColumns = Integer.valueOf(len.toString());

String[][] firstNRows = null;

int numberOfRowsNeeded = 10; // only looking at first 10 rows

try {

// go through each row group

while (rcReader.nextColumnsBatch()) {

// go through each column in current row group

for (int i=0; i<numberOfColumns; i++) {

BytesRefArrayWritable columnData = rcReader.getColumn(i, null);

if (firstNRows==null)

firstNRows = new String[Math.min(numberOfRowsNeeded,columnData.size())][numberOfColumns];

// for a given column, go through each row in current row group

for (int j=0; j<columnData.size() && j<numberOfRowsNeeded; j++) {

BytesRefWritable cellData = columnData.get(j);

byte[] currentCell = Arrays.copyOfRange(cellData.getData(), cellData.getStart(), cellData.getStart()+cellData.getLength());

Text currentCellStr = new Text(currentCell);

System.out.println("columnText="+currentCellStr);

firstNRows[j][i] = currentCellStr.toString();

}

} catch (IOException e1) {

// TODO Auto-generated catch block

e1.printStackTrace();

}

// transfer the matrix to row based from column based

for (int i=0; i<numberOfRowsNeeded; i++) {

for (int j=0; j<numberOfColumns; j++) {

if (j>0) System.out.print(",");

System.out.print(firstNRows[i][j]);

}

System.out.println();

}

}

A Test Driver:

/**
 * Test driver: opens a fixed RCFile on the configured file system and dumps it
 * using either the row-wise or the column-wise reader.
 *
 * <p>The Hadoop configuration is loaded from hard-coded local paths and the
 * RCFile location is hard-coded as well. If the file system or the reader
 * cannot be opened, the {@link IOException} is printed via
 * {@code printStackTrace()} and nothing is read.
 *
 * @param rowWise {@code true} to call {@code readRowWise}, {@code false} to call
 *                {@code readColumnWise}
 */
private static void testDirectRead(boolean rowWise) {
    Configuration conf = new Configuration();
    conf.addResource(new Path("C:\\etc\\Hadoop\\conf\\core-site.xml"));
    conf.addResource(new Path("C:\\etc\\Hadoop\\conf\\hdfs-site.xml"));
    conf.addResource(new Path("C:\\etc\\Hadoop\\conf\\mapred-site.xml"));

    RCFile.Reader rcReader = null;
    try {
        // Both steps live in one try block so a failure in either skips the
        // read instead of continuing with a null file system or reader.
        FileSystem fs = FileSystem.get(conf);
        rcReader = new RCFile.Reader(fs, new Path("/user/hive/warehouse/rc_userdatatest2/000000_0"), conf);

        if (rowWise) {
            readRowWise(rcReader);
        } else {
            readColumnWise(rcReader);
        }
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        // Close the reader even if a read method throws; it stays null when
        // opening failed, in which case there is nothing to close.
        if (rcReader != null) {
            rcReader.close();
        }
    }
}

Tech Notes

Friday, May 30, 2014

Eclipse: search files outside workspace

Friday, May 23, 2014

Row wise read vs column wise read for RCFile