hadoop fsck /
hadoop fsck -move / : the -move option moves corrupted files under /lost+found
hadoop fsck -delete / : the -delete option deletes all corrupted files
For more options: http://developer.yahoo.com/hadoop/tutorial/module2.html
hg addremove directory/file --similarity 90
The similarity value is a percentage: 90 detects renamed files that are at least 90% similar; use 100 to detect only renames whose content is unchanged.
Exception running child : org.apache.hadoop.util.DiskChecker$DiskErrorException: Could not find any valid local directory for output/spill0.out
This means the task could not write its map-output spill file to any of the configured local directories (typically because the local disks are full).
job.setNumReduceTasks(0); // makes the job map-only: map output is written directly by the OutputFormat and no spill files are created
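A minimal driver sketch, assuming a map-only job (the class name, job name, and the identity Mapper are placeholders, not from the original note), showing where setNumReduceTasks(0) fits:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class MapOnlyJobDriver {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = new Job(conf, "map-only example");   // hypothetical job name
    job.setJarByClass(MapOnlyJobDriver.class);
    job.setMapperClass(Mapper.class);              // identity mapper; replace with your own
    job.setOutputKeyClass(LongWritable.class);     // key/value types produced by the identity mapper over text input
    job.setOutputValueClass(Text.class);
    job.setNumReduceTasks(0);                      // map-only: output goes straight to the OutputFormat, no sort/spill
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}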
/**
 * RecordReader that lets CombineFileInputFormat read one file of a
 * CombineFileSplit at a time by delegating to RawWebGraphRecordReader.
 */
public class MultiFileRawWebGraphRecordReader extends RecordReader<Text, Text> {

  private static final Log LOG = LogFactory.getLog(MultiFileRawWebGraphRecordReader.class);

  private CombineFileSplit split;
  private TaskAttemptContext context;
  private int index;                    // which file of the combined split this reader handles
  private RecordReader<Text, Text> rr;  // the underlying per-file reader

  public MultiFileRawWebGraphRecordReader(CombineFileSplit split,
                                          TaskAttemptContext context,
                                          Integer index) throws IOException {
    this.split = split;
    this.context = context;
    this.index = index;
    rr = new RawWebGraphRecordReader();
  }

  @Override
  public void initialize(InputSplit genericSplit, TaskAttemptContext context)
      throws IOException, InterruptedException {
    this.split = (CombineFileSplit) genericSplit;
    this.context = context;
    if (null == rr) {
      rr = new RawWebGraphRecordReader();
    }
    // Build a FileSplit for the index-th file of the combined split and
    // initialize the delegate reader with it.
    FileSplit fileSplit = new FileSplit(this.split.getPath(index),
                                        this.split.getOffset(index),
                                        this.split.getLength(index),
                                        this.split.getLocations());
    this.rr.initialize(fileSplit, this.context);
  }

  @Override
  public boolean nextKeyValue() throws IOException, InterruptedException {
    return rr.nextKeyValue();
  }

  @Override
  public Text getCurrentKey() throws IOException, InterruptedException {
    return rr.getCurrentKey();
  }

  @Override
  public Text getCurrentValue() throws IOException, InterruptedException {
    return rr.getCurrentValue();
  }

  /**
   * Get the progress within the split.
   */
  @Override
  public float getProgress() throws IOException, InterruptedException {
    return rr.getProgress();
  }

  @Override
  public synchronized void close() throws IOException {
    if (rr != null) {
      rr.close();
      rr = null;
    }
  }
}
/**
 * CombineFileInputFormat that packs many small web-graph files into one split
 * and reads each of them with MultiFileRawWebGraphRecordReader.
 */
public static class RawGraphInputFormat extends CombineFileInputFormat<Text, Text> {

  @Override
  public RecordReader<Text, Text> createRecordReader(InputSplit split, TaskAttemptContext context)
      throws IOException {
    return new CombineFileRecordReader<Text, Text>(
        (CombineFileSplit) split, context, MultiFileRawWebGraphRecordReader.class);
  }

  @Override
  protected boolean isSplitable(JobContext context, Path file) {
    // Compressed files cannot be split unless the codec supports it.
    CompressionCodec codec = new CompressionCodecFactory(context.getConfiguration()).getCodec(file);
    return codec == null;
  }
}
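A hedged usage sketch (the helper class, job name, and input path are placeholders): plugging the combine-file input format above into a job so that many small files are packed into fewer, larger splits. Only setInputFormatClass and FileInputFormat.addInputPath are standard API here; RawGraphInputFormat is assumed to be accessible as defined above.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

public class RawGraphJobSetup {
  // Sketch: configure a job to read many small web-graph files through the combine input format.
  public static Job configure(Configuration conf, String inputDir) throws Exception {
    Job job = new Job(conf, "raw-webgraph");             // hypothetical job name
    job.setInputFormatClass(RawGraphInputFormat.class);  // the CombineFileInputFormat defined above
    FileInputFormat.addInputPath(job, new Path(inputDir));
    return job;
  }
}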
mapred.map.child.java.opts : heap size for map tasks
mapred.reduce.child.java.opts : heap size for reduce tasks
mapred.tasktracker.map.tasks.maximum : max map tasks that can run simultaneously per node
mapred.tasktracker.reduce.tasks.maximum : max reduce tasks that can run simultaneously per node
io.sort.factor : max # of streams to merge at once while sorting; used on both the map and reduce side
io.sort.mb : map-side memory buffer size used while sorting
mapred.job.shuffle.input.buffer.percent : reduce-side buffer - the percentage of the maximum heap size allocated for storing map outputs during the shuffle
fs.inmemory.size.mb : size (in MB) of the in-memory file system used to merge map outputs on the reduce side
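A hedged per-job example of setting some of these from the driver; the values are purely illustrative, not recommendations, and the TaskTracker-level maximums (mapred.tasktracker.*.tasks.maximum) are normally set cluster-wide in mapred-site.xml rather than per job:

import org.apache.hadoop.conf.Configuration;

public class TuningExample {
  public static Configuration tunedConf() {
    Configuration conf = new Configuration();
    conf.set("mapred.map.child.java.opts", "-Xmx1024m");    // heap for map task JVMs (illustrative value)
    conf.set("mapred.reduce.child.java.opts", "-Xmx2048m"); // heap for reduce task JVMs (illustrative value)
    conf.setInt("io.sort.mb", 256);                         // map-side sort buffer in MB (illustrative value)
    conf.setInt("io.sort.factor", 50);                      // streams merged at once (illustrative value)
    return conf;
  }
}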
Allocating a new Writable object for every output record is a very bad idea! Reuse a single instance instead, as in this reducer:

public static class UrlReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

  IntWritable sumw = new IntWritable();  // reused across all reduce() calls
  int sum;

  public void reduce(Text key, Iterable<IntWritable> vals, Context context)
      throws IOException, InterruptedException {
    sum = 0;
    for (IntWritable val : vals) {
      sum += val.get();
    }
    sumw.set(sum);
    context.write(key, sumw);
  }
}
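For comparison, a sketch of a matching mapper (not from the original post; imports of org.apache.hadoop.io.* and org.apache.hadoop.mapreduce.Mapper are assumed, as is one-URL-per-line input) that applies the same reuse pattern: the Text key and IntWritable value are created once and rewritten on every call.

public static class UrlMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

  private final Text url = new Text();                         // reused key
  private static final IntWritable ONE = new IntWritable(1);   // reused value

  @Override
  public void map(LongWritable offset, Text line, Context context)
      throws IOException, InterruptedException {
    url.set(line.toString().trim());  // assumes one URL per input line (illustrative)
    context.write(url, ONE);
  }
}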
hg rm -Af : stop tracking files without deleting them from the working directory
hg forget : equivalent to hg rm -Af