private static void readRowWise(RCFile.Reader rcReader) {
 int rowCounter = 0;
 // The column count is stored in the file metadata under this key.
 Text len = rcReader.getMetadata().get(new Text("hive.io.rcfile.column.number"));
 int numberOfColumns = Integer.valueOf(len.toString());
 try {
  while (rcReader.next(new LongWritable(rowCounter))) {
   BytesRefArrayWritable cols = new BytesRefArrayWritable();
   /**
    * 'resetValid' has to be called for every row so the writable knows how many columns
    * to expose. This looks ugly, but it is what makes row-wise reading work.
    *
    * The name of getCurrentRow is somewhat misleading. It actually reads all rows of the
    * current row group, column by column (due to the nature of the RCFile format), and
    * stores them internally, so the next call to getCurrentRow returns the same data buffer.
    * By default it sets the 'valid' variable to the number of columns, so only the columns
    * of the first row can be obtained by calling cols.get(i).
    *
    * Once the first row has been read, a call to 'resetValid' lets us read the next row.
    * The value passed to 'resetValid' has to be the number of columns so that all columns
    * of the next row can be read.
    */
   cols.resetValid(numberOfColumns);
   rcReader.getCurrentRow(cols);
   int size = cols.size(); // this actually returns the number of columns in the current row
   for (int i = 0; i < size; i++) {
    BytesRefWritable currentColumn = cols.get(i);
    byte[] currentColumnBytes = currentColumn.getBytesCopy(); // column data for the current row
    Text text = new Text(currentColumnBytes);
    System.out.println("columnText=" + text.toString());
   }
   rowCounter++;
  }
 } catch (IOException e) {
  e.printStackTrace();
 }
}
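Before moving on to the column-wise path: if there is no RCFile-backed Hive table handy to test against, a small file can be generated directly with RCFile.Writer and then fed to the readers here. The code below is just a sketch: the output path and cell values are made up, and it assumes the three-argument Writer constructor plus RCFileOutputFormat.setColumnNumber, which, as far as I can tell, is how the column count ends up in the "hive.io.rcfile.column.number" metadata entry read at the top of readRowWise.

private static void writeSampleRCFile(FileSystem fs, Configuration conf) throws IOException {
 int numberOfColumns = 3;
 // The writer takes the column count from the configuration and records it in the
 // file metadata, which is where readRowWise picks it up again.
 RCFileOutputFormat.setColumnNumber(conf, numberOfColumns);
 RCFile.Writer writer = new RCFile.Writer(fs, conf, new Path("/tmp/rcfile_sample")); // hypothetical path
 for (int row = 0; row < 5; row++) {
  BytesRefArrayWritable cols = new BytesRefArrayWritable(numberOfColumns);
  for (int col = 0; col < numberOfColumns; col++) {
   byte[] cell = ("row" + row + "_col" + col).getBytes("UTF-8");
   cols.set(col, new BytesRefWritable(cell, 0, cell.length));
  }
  writer.append(cols); // one row at a time; the writer buffers rows into row groups
 }
 writer.close();
}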
Column Wise Read:
private static void readColumnWise(RCFile.Reader rcReader) {
 Text len = rcReader.getMetadata().get(new Text("hive.io.rcfile.column.number"));
 int numberOfColumns = Integer.valueOf(len.toString());
 String[][] firstNRows = null;
 int numberOfRowsNeeded = 10;  // only looking at first 10 rows
 try {
  // go through each row group
  while (rcReader.nextColumnsBatch()) {
   // go through each column in current row group
   for (int i=0; i<numberOfColumns; i++) {
    BytesRefArrayWritable columnData = rcReader.getColumn(i, null);
    if (firstNRows==null)
     firstNRows = new String[Math.min(numberOfRowsNeeded,columnData.size())][numberOfColumns];
    // for a given column, go through each row in current row group
    for (int j=0; j<columnData.size() && j<numberOfRowsNeeded; j++) {
     BytesRefWritable cellData = columnData.get(j);
     byte[] currentCell = Arrays.copyOfRange(cellData.getData(), cellData.getStart(), cellData.getStart()+cellData.getLength());
     Text currentCellStr = new Text(currentCell);
     System.out.println("columnText="+currentCellStr);
      // Keep only the values from the first row group; later row groups would otherwise overwrite them.
      if (firstNRows[j][i] == null)
       firstNRows[j][i] = currentCellStr.toString();
    }
   }
  }
 } catch (IOException e1) {
  // TODO Auto-generated catch block
  e1.printStackTrace();
 }
 // print the column-oriented buffer back out row by row
 for (int i=0; firstNRows != null && i<firstNRows.length; i++) {
  for (int j=0; j<numberOfColumns; j++) {
   if (j>0) System.out.print(",");
   System.out.print(firstNRows[i][j]);
  }
  System.out.println();
 }
}
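Column-wise access really pays off when only a subset of columns is needed. My reading of the RCFile.Reader code is that it honors Hive's column-projection setting, so columns that are not listed are skipped rather than decompressed; the property name below and the exact behavior may differ between Hive versions, so treat this as an assumption rather than a documented contract. A minimal sketch:

// Hypothetical helper: open a reader that only materializes columns 0 and 2.
private static RCFile.Reader openProjectedReader(FileSystem fs, Path file, Configuration conf) throws IOException {
 // Comma-separated list of zero-based column ids to read; the rest should be skipped by the reader.
 conf.set("hive.io.file.readcolumn.ids", "0,2");
 return new RCFile.Reader(fs, file, conf);
}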
A Test Driver:
private static void testDirectRead(boolean rowWise) {
 Configuration conf = new Configuration();
 conf.addResource(new Path("C:\\etc\\Hadoop\\conf\\core-site.xml"));
 conf.addResource(new Path("C:\\etc\\Hadoop\\conf\\hdfs-site.xml"));
 conf.addResource(new Path("C:\\etc\\Hadoop\\conf\\mapred-site.xml"));

 RCFile.Reader rcReader = null;
 try {
  FileSystem fs = FileSystem.get(conf);
  rcReader = new RCFile.Reader(fs, new Path("/user/hive/warehouse/rc_userdatatest2/000000_0"), conf);
 } catch (IOException e) {
  e.printStackTrace();
  return; // no point in continuing without a reader
 }

 if (rowWise)
  readRowWise(rcReader);
 else
  readColumnWise(rcReader);

 rcReader.close();
}
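For completeness, a trivial entry point to exercise both paths back to back (nothing beyond the two read methods above is assumed):

public static void main(String[] args) {
 testDirectRead(true);   // row-wise
 testDirectRead(false);  // column-wise
}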
 