Friday, May 30, 2014

Eclipse: search files outside workspace


I used to use Visual Studio and IntelliJ IDEA for coding.  When I switched to Eclipse, I found that its search is very workspace-oriented: I could not search for anything outside of my project.

Finally, I found this useful link, which solves the problem:

http://eclipse.dzone.com/articles/5-best-eclipse-plugins-system

Friday, May 23, 2014

Row wise read vs column wise read for RCFile

Row Wise Read:

private static void readRowWise(RCFile.Reader rcReader) {
int rowcounter = 0;
Text len = rcReader.getMetadata().get(new Text("hive.io.rcfile.column.number"));
int numberOfColumns = Integer.valueOf(len.toString());

try {
while (rcReader.next(new LongWritable(rowcounter))) {
BytesRefArrayWritable cols = new BytesRefArrayWritable();

  /** * Have to call 'resetValid' for all rows to allocate how many columns for each row. * This looks ugly. But this is the way to make the row wise reading working. */  cols.resetValid(numberOfColumns);

/**
* The name of getCurrentRow is kind of misleading.  It actually reads all rows in the current row group,
* column by column (due to the file format nature of RCFile) and store them internally so next call to getCurrentRow
* will actually return the same data buffer. By default, it sets 'valid' variable to number of columns so only the columns
* for first row can be gotten by calling cols.get(i).
*
* Once first row is read, a call to 'resetValid' will allow us to read next row.  The value passed to 'resetValid'
* have to be the number of columns to allow read all columns for next row.
*/
rcReader.getCurrentRow(cols);

int size = cols.size();  // this actually returns the number of columns in the current row.

for (int i= 0; i<size; i++) {
BytesRefWritable currentColumn = cols.get(i);

byte[] currentColumnBytes = currentColumn.getBytesCopy();  // get current column data for the current row
Text text = new Text(currentColumnBytes);
System.out.println("columnText="+text.toString());
}
rowcounter++;
}
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}


Column Wise Read:

private static void readColumnWise(RCFile.Reader rcReader) {
Text len = rcReader.getMetadata().get(new Text("hive.io.rcfile.column.number"));
int numberOfColumns = Integer.valueOf(len.toString());
String[][] firstNRows = null;
int numberOfRowsNeeded = 10;  // only looking at first 10 rows
try {
// go through each row group
while (rcReader.nextColumnsBatch()) {
// go through each column in current row group
for (int i=0; i<numberOfColumns; i++) {
BytesRefArrayWritable columnData = rcReader.getColumn(i, null);
if (firstNRows==null)
firstNRows = new String[Math.min(numberOfRowsNeeded,columnData.size())][numberOfColumns];
// for a given column, go through each row in current row group
for (int j=0; j<columnData.size() && j<numberOfRowsNeeded; j++) {
BytesRefWritable cellData = columnData.get(j);
byte[] currentCell = Arrays.copyOfRange(cellData.getData(), cellData.getStart(), cellData.getStart()+cellData.getLength());
Text currentCellStr = new Text(currentCell);
System.out.println("columnText="+currentCellStr);
firstNRows[j][i] = currentCellStr.toString();
}
}
}
} catch (IOException e1) {
// TODO Auto-generated catch block
e1.printStackTrace();
}
// transfer the matrix to row based from column based
for (int i=0; i<numberOfRowsNeeded; i++) {
for (int j=0; j<numberOfColumns; j++) {
if (j>0) System.out.print(",");
System.out.print(firstNRows[i][j]);
}
System.out.println();
}
}

A Test Driver:

/**
 * Test driver: opens a fixed RCFile and dumps it either row-wise or column-wise.
 *
 * The Hadoop configuration is assembled from the core-site, hdfs-site and mapred-site
 * XML files under C:\etc\Hadoop\conf, and the file read is
 * /user/hive/warehouse/rc_userdatatest2/000000_0.
 *
 * If the file system or the reader cannot be obtained, the IOException is printed via
 * printStackTrace and the method returns without attempting to read. Once the reader
 * has been opened it is always closed, even if the read method throws.
 *
 * @param rowWise true to use readRowWise, false to use readColumnWise
 */
private static void testDirectRead(boolean rowWise) {
    Configuration conf = new Configuration();
    conf.addResource(new Path("C:\\etc\\Hadoop\\conf\\core-site.xml"));
    conf.addResource(new Path("C:\\etc\\Hadoop\\conf\\hdfs-site.xml"));
    conf.addResource(new Path("C:\\etc\\Hadoop\\conf\\mapred-site.xml"));

    RCFile.Reader rcReader;
    try {
        FileSystem fs = FileSystem.get(conf);
        rcReader = new RCFile.Reader(fs, new Path("/user/hive/warehouse/rc_userdatatest2/000000_0"), conf);
    } catch (IOException e) {
        // Without a file system or a reader there is nothing to read; stop here rather
        // than carrying a null reference into the read methods.
        e.printStackTrace();
        return;
    }

    try {
        if (rowWise) {
            readRowWise(rcReader);
        } else {
            readColumnWise(rcReader);
        }
    } finally {
        // Release the reader even when the read method fails with a runtime exception.
        rcReader.close();
    }
}