Your project can be on or off. Your project's priority can be changed. Your job can be changed. But the technology is always heading north!
Friday, May 30, 2014
Eclipse: search files outside workspace
I used to use Visual Studio and IntelliJ IDEA for coding. When I switched to Eclipse, I found that its search is very workspace-oriented: I could not search for anything outside of my project.
Finally, I found this useful link that solves the problem:
http://eclipse.dzone.com/articles/5-best-eclipse-plugins-system
Friday, May 23, 2014
Row wise read vs column wise read for RCFile
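The snippets below read an RCFile directly through Hive's RCFile.Reader. They assume the following Hadoop and Hive classes are on the classpath (the usual Hadoop io and Hive serde2/ql.io packages):

import java.io.IOException;
import java.util.Arrays;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.io.RCFile;
import org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable;
import org.apache.hadoop.hive.serde2.columnar.BytesRefWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;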
Row Wise Read:
private static void readRowWise(RCFile.Reader rcReader) {
    int rowCounter = 0;
    Text len = rcReader.getMetadata().get(new Text("hive.io.rcfile.column.number"));
    int numberOfColumns = Integer.valueOf(len.toString());
    try {
        while (rcReader.next(new LongWritable(rowCounter))) {
            BytesRefArrayWritable cols = new BytesRefArrayWritable();
            /**
             * 'resetValid' has to be called for every row to declare how many columns the row has.
             * This looks ugly, but it is what makes row-wise reading work.
             */
            cols.resetValid(numberOfColumns);
            /**
             * The name of getCurrentRow is somewhat misleading. It actually reads all rows in the current row group,
             * column by column (due to the columnar nature of the RCFile format), and stores them internally, so the
             * next call to getCurrentRow returns the same data buffer. By default, the 'valid' variable is set to the
             * number of columns, so only the columns of the first row can be fetched with cols.get(i).
             *
             * Once the first row is read, calling 'resetValid' allows the next row to be read. The value passed to
             * 'resetValid' has to be the number of columns so that all columns of the next row can be read.
             */
            rcReader.getCurrentRow(cols);
            int size = cols.size(); // this actually returns the number of columns in the current row
            for (int i = 0; i < size; i++) {
                BytesRefWritable currentColumn = cols.get(i);
                byte[] currentColumnBytes = currentColumn.getBytesCopy(); // current column's data for the current row
                Text text = new Text(currentColumnBytes);
                System.out.println("columnText=" + text.toString());
            }
            rowCounter++;
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
}
Column Wise Read:
private static void readColumnWise(RCFile.Reader rcReader) {
    Text len = rcReader.getMetadata().get(new Text("hive.io.rcfile.column.number"));
    int numberOfColumns = Integer.valueOf(len.toString());
    String[][] firstNRows = null;
    int numberOfRowsNeeded = 10; // only look at the first 10 rows
    try {
        // go through each row group
        while (rcReader.nextColumnsBatch()) {
            // go through each column in the current row group
            for (int i = 0; i < numberOfColumns; i++) {
                BytesRefArrayWritable columnData = rcReader.getColumn(i, null);
                if (firstNRows == null)
                    firstNRows = new String[Math.min(numberOfRowsNeeded, columnData.size())][numberOfColumns];
                // for a given column, go through each row in the current row group
                for (int j = 0; j < columnData.size() && j < firstNRows.length; j++) {
                    BytesRefWritable cellData = columnData.get(j);
                    byte[] currentCell = Arrays.copyOfRange(cellData.getData(), cellData.getStart(),
                            cellData.getStart() + cellData.getLength());
                    Text currentCellStr = new Text(currentCell);
                    System.out.println("columnText=" + currentCellStr);
                    firstNRows[j][i] = currentCellStr.toString();
                }
            }
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
    // print the column-organized buffer back out in row order
    if (firstNRows != null) {
        for (int i = 0; i < firstNRows.length; i++) {
            for (int j = 0; j < numberOfColumns; j++) {
                if (j > 0) System.out.print(",");
                System.out.print(firstNRows[i][j]);
            }
            System.out.println();
        }
    }
}
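Note the difference in how the two readers advance: readRowWise moves one row at a time with next(), while readColumnWise moves one row group at a time with nextColumnsBatch() and pulls one column of that group at a time with getColumn(). That is what lets a column-wise reader touch only the columns it cares about. If only a single column is needed, the loop shrinks to something like this sketch (same reader API as above; the column index is just a parameter):

private static void readSingleColumn(RCFile.Reader rcReader, int columnIndex) throws IOException {
    // advance one row group at a time
    while (rcReader.nextColumnsBatch()) {
        // fetch only the requested column for the current row group
        BytesRefArrayWritable columnData = rcReader.getColumn(columnIndex, null);
        // one cell per row in this row group
        for (int j = 0; j < columnData.size(); j++) {
            BytesRefWritable cell = columnData.get(j);
            System.out.println("columnText=" + new Text(cell.getBytesCopy()));
        }
    }
}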
A Test Driver:
private static void testDirectRead(boolean rowWise) {
    Configuration conf = new Configuration();
    conf.addResource(new Path("C:\\etc\\Hadoop\\conf\\core-site.xml"));
    conf.addResource(new Path("C:\\etc\\Hadoop\\conf\\hdfs-site.xml"));
    conf.addResource(new Path("C:\\etc\\Hadoop\\conf\\mapred-site.xml"));
    try {
        FileSystem fs = FileSystem.get(conf);
        RCFile.Reader rcReader = new RCFile.Reader(fs,
                new Path("/user/hive/warehouse/rc_userdatatest2/000000_0"), conf);
        if (rowWise)
            readRowWise(rcReader);
        else
            readColumnWise(rcReader);
        rcReader.close();
    } catch (IOException e) {
        e.printStackTrace();
    }
}
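To try both paths, the driver can be called from a plain main method, for example:

public static void main(String[] args) {
    testDirectRead(true);  // row-wise read
    testDirectRead(false); // column-wise read
}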