hadoop fsck / : check HDFS for corrupt or missing blocks
hadoop fsck / -delete : the -delete option deletes all corrupted files
hadoop fsck / -move : the -move option moves corrupted files under /lost+found
For more options: http://developer.yahoo.com/hadoop/tutorial/module2.html
hg addremove directory/file --similarity 90
The number is a similarity percentage used for rename detection; use 100 if you only want to match files with no changes at all.
Exception running child : org.apache.hadoop.util.DiskChecker$DiskErrorException: Could not find any valid local directory for output/spill0.out
This error usually means the task could not find a local directory (mapred.local.dir) with enough free space for its map-output spill files. If the job does not actually need a reduce phase, one workaround is to make it map-only, so map output is written straight to HDFS instead of being spilled to local disk:
job.setNumReduceTasks(0);
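Below is a minimal map-only driver sketch built around that call; the class name, job name, and the args-based input/output paths are illustrative assumptions, not from the original note. It relies on the default identity Mapper, so TextInputFormat's LongWritable offset / Text line records are written directly to the output directory.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class MapOnlyJobDriver {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = new Job(conf, "map-only example");
        job.setJarByClass(MapOnlyJobDriver.class);
        // Default identity Mapper; output types match TextInputFormat's records.
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);
        // Zero reducers: map output goes directly to the output path on HDFS,
        // so no map-side spill files are created under mapred.local.dir.
        job.setNumReduceTasks(0);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}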
import java.io.IOException;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.CombineFileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.CombineFileRecordReader;
import org.apache.hadoop.mapreduce.lib.input.CombineFileSplit;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

public class MultiFileRawWebGraphRecordReader extends RecordReader<Text, Text> {

    private static final Log LOG = LogFactory.getLog(MultiFileRawWebGraphRecordReader.class);

    private CombineFileSplit split;
    private TaskAttemptContext context;
    private int index;
    // Underlying per-file reader (defined elsewhere) that actually parses the web-graph records.
    private RecordReader<Text, Text> rr;

    public MultiFileRawWebGraphRecordReader(CombineFileSplit split,
                                            TaskAttemptContext context,
                                            Integer index) throws IOException {
        this.split = split;
        this.context = context;
        this.index = index;
        rr = new RawWebGraphRecordReader();
    }

    @Override
    public void initialize(InputSplit genericSplit, TaskAttemptContext context)
            throws IOException, InterruptedException {
        this.split = (CombineFileSplit) genericSplit;
        this.context = context;
        if (null == rr) {
            rr = new RawWebGraphRecordReader();
        }
        // Delegate to the underlying reader, handing it a FileSplit that covers
        // only the index-th file of the combined split.
        FileSplit fileSplit = new FileSplit(this.split.getPath(index),
                                            this.split.getOffset(index),
                                            this.split.getLength(index),
                                            this.split.getLocations());
        this.rr.initialize(fileSplit, this.context);
    }

    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
        return rr.nextKeyValue();
    }

    @Override
    public Text getCurrentKey() throws IOException, InterruptedException {
        return rr.getCurrentKey();
    }

    @Override
    public Text getCurrentValue() throws IOException, InterruptedException {
        return rr.getCurrentValue();
    }

    /**
     * Get the progress within the split.
     */
    @Override
    public float getProgress() throws IOException, InterruptedException {
        return rr.getProgress();
    }

    @Override
    public synchronized void close() throws IOException {
        if (rr != null) {
            rr.close();
            rr = null;
        }
    }

    public static class RawGraphInputFormat extends CombineFileInputFormat<Text, Text> {

        @Override
        public RecordReader<Text, Text> createRecordReader(InputSplit split,
                TaskAttemptContext context) throws IOException {
            // Hand each file of the combined split to MultiFileRawWebGraphRecordReader.
            return new CombineFileRecordReader<Text, Text>(
                    (CombineFileSplit) split,
                    context,
                    MultiFileRawWebGraphRecordReader.class);
        }

        @Override
        protected boolean isSplitable(JobContext context, Path file) {
            // Compressed files cannot be split.
            CompressionCodec codec = new CompressionCodecFactory(
                    context.getConfiguration()).getCodec(file);
            return codec == null;
        }
    }
}
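A sketch of how this combined input format might be wired into a driver; the driver class, the pass-through mapper, and the argument-based paths are hypothetical placeholders, not part of the original code.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class WebGraphDriver {
    // Placeholder mapper: simply re-emits the Text/Text records produced by the reader.
    public static class PassThroughMapper extends Mapper<Text, Text, Text, Text> {
        @Override
        protected void map(Text key, Text value, Context context)
                throws IOException, InterruptedException {
            context.write(key, value);
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = new Job(conf, "raw web graph");
        job.setJarByClass(WebGraphDriver.class);
        // Use the combined input format so many small graph files share one map task.
        job.setInputFormatClass(MultiFileRawWebGraphRecordReader.RawGraphInputFormat.class);
        job.setMapperClass(PassThroughMapper.class);
        job.setNumReduceTasks(0);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

Note that CombineFileInputFormat packs many files into each split, so without a configured maximum split size a single map task may end up processing a very large number of files.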
mapred.map.child.java.opts: heap size for map tasks
mapred.reduce.child.java.opts: heap size for reduce tasks
mapred.tasktracker.map.tasks.maximum: max number of map tasks that can run simultaneously per node
mapred.tasktracker.reduce.tasks.maximum: max number of reduce tasks that can run simultaneously per node
io.sort.factor: max number of streams to merge at once while sorting; used on both the map and reduce side
io.sort.mb: map-side memory buffer size used while sorting
mapred.job.shuffle.input.buffer.percent: reduce-side buffer; the percentage of the maximum heap size allocated for storing map outputs during the shuffle
fs.inmemory.size.mb: using this one is a very bad idea!
(A sketch of setting some of these from a driver follows the reducer example below.)

public static class UrlReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    IntWritable sumw = new IntWritable();
    int sum;

    @Override
    public void reduce(Text key, Iterable<IntWritable> vals, Context context)
            throws IOException, InterruptedException {
        sum = 0;
        // Sum the per-URL counts emitted by the mapper.
        for (IntWritable val : vals) {
            sum += val.get();
        }
        sumw.set(sum);
        context.write(key, sumw);
    }
}
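A minimal sketch of tuning a few of the parameters listed above from a job driver, assuming Hadoop 1.x-era property names; the class name and the specific values are illustrative only, not recommendations.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

public class TunedJobDriver {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Example values only; tune to the cluster's memory and core counts.
        conf.set("mapred.map.child.java.opts", "-Xmx512m");
        conf.set("mapred.reduce.child.java.opts", "-Xmx1024m");
        conf.setInt("io.sort.mb", 256);
        conf.setInt("io.sort.factor", 50);
        conf.setFloat("mapred.job.shuffle.input.buffer.percent", 0.7f);
        // The slot counts (mapred.tasktracker.*.tasks.maximum) are read by the
        // TaskTracker at startup, so they belong in mapred-site.xml on each node
        // rather than in per-job configuration.
        Job job = new Job(conf, "tuned job");
        // set mapper/reducer, input/output paths, then submit as usual.
    }
}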
hg rm -Af
hg forget
Both stop tracking the given files on the next commit without deleting them from the working directory (hg forget is equivalent to hg rm -Af).