Copyright notice: this is an original post by the author and may not be reproduced without permission. https://blog.csdn.net/wcandy001/article/details/49668391
package org.apache.nutch.crawl;
import java.io.*;
import java.text.SimpleDateFormat;
import java.util.*;
// Commons Logging imports
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.io.*;
import org.apache.hadoop.fs.*;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.*;
import org.apache.nutch.util.HadoopFSUtil;
import org.apache.nutch.util.LockUtil;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
import org.apache.nutch.util.TimingUtil;
// This is a MapReduce program: CrawlDb extends Configured and implements Tool, overriding its run() method
public class CrawlDb extends Configured implements Tool {
// Logger and a few constants
public static final Logger LOG = LoggerFactory.getLogger(CrawlDb.class);
public static final String CRAWLDB_ADDITIONS_ALLOWED = "db.update.additions.allowed";
public static final String CRAWLDB_PURGE_404 = "db.update.purge.404";
public static final String CURRENT_NAME = "current";
public static final String LOCK_NAME = ".locked";
// No-arg constructor
public CrawlDb() {}
// Constructor that takes a Configuration
public CrawlDb(Configuration conf) {
setConf(conf);
}
// Convenience overload: reads db.update.additions.allowed from the configuration and delegates to the full update() below
public void update(Path crawlDb, Path[] segments, boolean normalize, boolean filter) throws IOException {
boolean additionsAllowed = getConf().getBoolean(CRAWLDB_ADDITIONS_ALLOWED, true);
update(crawlDb, segments, normalize, filter, additionsAllowed, false);
}
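// A minimal programmatic-usage sketch (hypothetical, not part of the original class):
// another driver could call update() directly instead of going through the command line.
// The paths below are placeholders.
//   Configuration conf = NutchConfiguration.create();
//   conf.setBoolean(CrawlDb.CRAWLDB_ADDITIONS_ALLOWED, false); // only update URLs already in the db
//   CrawlDb crawlDb = new CrawlDb(conf);
//   Path db = new Path("crawl/crawldb");
//   Path[] segments = { new Path("crawl/segments/SEGMENT_NAME") };
//   crawlDb.update(db, segments, true /* normalize */, true /* filter */);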
public void update(Path crawlDb, Path[] segments, boolean normalize, boolean filter, boolean additionsAllowed, boolean force) throws IOException {
// Get a FileSystem handle from the configuration
FileSystem fs = FileSystem.get(getConf());
// Path of the lock file under the crawlDb directory
Path lock = new Path(crawlDb, LOCK_NAME);
// LockUtil.createLockFile decides whether the CrawlDb can be locked for this update; its source is roughly:
// public static void createLockFile(FileSystem fs, Path lockFile, boolean accept) throws IOException {
//如果锁文件已经存在: If the lock file already exists, an IOException is thrown unless accept (the force flag) is true;
//an existing lock that is actually a directory is always an error.
// if (fs.exists(lockFile)) {
// if(!accept)
// throw new IOException("lock file " + lockFile + " already exists.");
// if (fs.getFileStatus(lockFile).isDir())
// throw new IOException("lock file " + lockFile + " already exists and is a directory.");
// // do nothing - the file already exists.
// } else {
// If we get here the lock file does not exist yet, so it can be created:
// // make sure parents exist
// fs.mkdirs(lockFile.getParent());
// fs.createNewFile(lockFile);
// }
LockUtil.createLockFile(fs, lock, force);
// Date format used for the timestamps in the log output
SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
// Record the start time of the update
long start = System.currentTimeMillis();
// Build the MapReduce job that will produce the new CrawlDb (see createJob() below)
JobConf job = CrawlDb.createJob(getConf(), crawlDb);
// If you are not sure what the settings below refer to, these constants from CrawlDbFilter make it clear:
// public class CrawlDbFilter implements Mapper<Text, CrawlDatum, Text, CrawlDatum> {
// public static final String URL_FILTERING = "crawldb.url.filters";
//
// public static final String URL_NORMALIZING = "crawldb.url.normalizers";
//
// public static final String URL_NORMALIZING_SCOPE = "crawldb.url.normalizers.scope";
// CrawlDbFilter is the Mapper of this job; the flags set below control whether URLs are normalized and filtered during the update (they are also echoed in the log output further down). A simplified sketch of that mapper follows the three setBoolean calls.
job.setBoolean(CRAWLDB_ADDITIONS_ALLOWED, additionsAllowed);
job.setBoolean(CrawlDbFilter.URL_FILTERING, filter);
job.setBoolean(CrawlDbFilter.URL_NORMALIZING, normalize);
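// A simplified sketch of what a CrawlDbFilter-style mapper does with each record
// (an illustration of the normalize-then-filter pattern, not the exact Nutch source;
// the real class also handles 404 purging and normalizer scopes):
//   public void map(Text key, CrawlDatum value,
//                   OutputCollector<Text, CrawlDatum> output, Reporter reporter)
//       throws IOException {
//     String url = key.toString();
//     if (urlNormalizing) {
//       try {
//         url = normalizers.normalize(url, URLNormalizers.SCOPE_CRAWLDB);
//       } catch (Exception e) {
//         url = null;                // malformed URL: drop the record
//       }
//     }
//     if (url != null && urlFiltering) {
//       try {
//         url = filters.filter(url); // returns null if the URL is rejected
//       } catch (Exception e) {
//         url = null;
//       }
//     }
//     if (url != null) {
//       output.collect(new Text(url), value);
//     }
//   }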
// Whether CrawlDatum entries whose pages returned 404 (gone) should be purged from the CrawlDb; defaults to false
boolean url404Purging = job.getBoolean(CRAWLDB_PURGE_404, false);
// Log the update parameters
if (LOG.isInfoEnabled()) {
LOG.info("CrawlDb update: starting at " + sdf.format(start));
LOG.info("CrawlDb update: db: " + crawlDb);
LOG.info("CrawlDb update: segments: " + Arrays.asList(segments));
LOG.info("CrawlDb update: additions allowed: " + additionsAllowed);
LOG.info("CrawlDb update: URL normalizing: " + normalize);
LOG.info("CrawlDb update: URL filtering: " + filter);
LOG.info("CrawlDb update: 404 purging: " + url404Purging);
}
// For each segment, add its crawl_fetch and crawl_parse subdirectories as job inputs
for (int i = 0; i < segments.length; i++) {
Path fetch = new Path(segments[i], CrawlDatum.FETCH_DIR_NAME);
Path parse = new Path(segments[i], CrawlDatum.PARSE_DIR_NAME);
// Only segments that contain both subdirectories are used
if (fs.exists(fetch) && fs.exists(parse)) {
FileInputFormat.addInputPath(job, fetch);
FileInputFormat.addInputPath(job, parse);
} else {
// Segments missing either subdirectory are considered invalid and are skipped
LOG.info(" - skipping invalid segment " + segments[i]);
}
}
// All input paths are now registered
if (LOG.isInfoEnabled()) {
LOG.info("CrawlDb update: Merging segment data into db.");
}
try {
// Run the merge job; if it fails with an IOException, clean up below and rethrow
JobClient.runJob(job);
} catch (IOException e) {
// For reference, LockUtil.removeLockFile:
// public static boolean removeLockFile(FileSystem fs, Path lockFile) throws IOException {
// if (!fs.exists(lockFile)) return false;
// if (fs.getFileStatus(lockFile).isDir())
// throw new IOException("lock file " + lockFile + " exists but is a directory!");
// return fs.delete(lockFile, false);
// }
// removeLockFile returns false if the lock file does not exist, throws if it is somehow a directory,
// and otherwise deletes it. Here it releases the lock that was created at the start of this update.
LockUtil.removeLockFile(fs, lock);
// Temporary output path of the failed job
Path outPath = FileOutputFormat.getOutputPath(job);
// Delete any partial output the failed job may have written, then rethrow the exception
if (fs.exists(outPath) ) fs.delete(outPath, true);
throw e;
}
// On success, install() promotes the new CrawlDb; its source (also repeated further down in this class) is:
// public static void install(JobConf job, Path crawlDb) throws IOException {
// db.preserve.backup (default true) controls whether the previous CrawlDb is kept as "old"
// boolean preserveBackup = job.getBoolean("db.preserve.backup", true);
// This is where the "current" and "old" directories you see inside the crawldb after a crawl are produced:
// Path newCrawlDb = FileOutputFormat.getOutputPath(job);
// FileSystem fs = new JobClient(job).getFs();
// Path old = new Path(crawlDb, "old");
// Path current = new Path(crawlDb, CURRENT_NAME);
// If "current" already exists, delete any previous "old" and rename "current" to "old" (the rotation step)
// if (fs.exists(current)) {
// if (fs.exists(old)) fs.delete(old, true);
// fs.rename(current, old);
// }
// fs.mkdirs(crawlDb);
// fs.rename(newCrawlDb, current);
// if (!preserveBackup && fs.exists(old)) fs.delete(old, true);
// Path lock = new Path(crawlDb, LOCK_NAME);
// LockUtil.removeLockFile(fs, lock);
// }
CrawlDb.install(job, crawlDb);
// Record the finish time and log the elapsed time
long end = System.currentTimeMillis();
LOG.info("CrawlDb update: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
}
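// For orientation, after a successful update the crawldb typically looks roughly like this
// (one part-* directory per reduce partition; exact names depend on the Hadoop version and
// the number of reducers):
//   crawl/crawldb/current/part-00000/data    <- MapFile data (sorted URL -> CrawlDatum records)
//   crawl/crawldb/current/part-00000/index   <- MapFile index
//   crawl/crawldb/old/...                    <- previous version, kept while db.preserve.backup=true
// The .locked file is removed by install() once the new version is in place.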
/*
* Configure a new CrawlDb in a temp folder at crawlDb/<rand>
*/
// Job setup (the "driver" part)
public static JobConf createJob(Configuration config, Path crawlDb)
throws IOException {
// Temporary output path: crawlDb/<random int>
Path newCrawlDb =
new Path(crawlDb,
Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
// Create the job from the Nutch configuration
JobConf job = new NutchJob(config);
// Job name
job.setJobName("crawldb " + crawlDb);
// Path of the existing "current" directory inside the crawlDb
Path current = new Path(crawlDb, CURRENT_NAME);
if (FileSystem.get(job).exists(current)) {
// If "current" already exists, add it as a job input
FileInputFormat.addInputPath(job, current);
}
// Input is read as SequenceFiles
job.setInputFormat(SequenceFileInputFormat.class);
// Mapper and Reducer classes
job.setMapperClass(CrawlDbFilter.class);
job.setReducerClass(CrawlDbReducer.class);
// Output goes to the temporary crawlDb path
FileOutputFormat.setOutputPath(job, newCrawlDb);
//The output format deserves a note: MapFile is built on top of SequenceFile and is essentially an indexed SequenceFile.
//A MapFile consists of two parts, data and index, both implemented as SequenceFiles: data stores the key/value records
//in sorted key order, and index records a sample of the keys together with their offsets in the data file.
//On access, the index is loaded into memory and used to locate a record's position, so lookups do not have to scan the whole file.
//Because a MapFile is globally sorted by key, MapFileOutputFormat output can be read back efficiently with MapFile readers
//(a typical use case is merging many small sorted inputs into one large sorted file). A small read-back sketch follows this method.
job.setOutputFormat(MapFileOutputFormat.class);
// Output key/value types: Text keys (URLs) and CrawlDatum values
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(CrawlDatum.class);
// https://issues.apache.org/jira/browse/NUTCH-1110
job.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);
return job;
}
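// A minimal read-back sketch (assuming Nutch 1.x and the old "mapred" API; it mirrors what
// CrawlDbReader does, but treat it as an illustration rather than the exact Nutch source):
// looking up the CrawlDatum for one URL in the MapFile partitions under crawldb/current.
// Extra imports needed: org.apache.hadoop.io.MapFile and org.apache.hadoop.mapred.lib.HashPartitioner.
//   Configuration conf = NutchConfiguration.create();
//   FileSystem fs = FileSystem.get(conf);
//   Path current = new Path("crawl/crawldb", CrawlDb.CURRENT_NAME);
//   MapFile.Reader[] readers = MapFileOutputFormat.getReaders(fs, current, conf);
//   Text key = new Text("http://example.com/");
//   CrawlDatum value = new CrawlDatum();
//   // getEntry() uses the partitioner to open only the partition that can contain this key,
//   // then does an indexed lookup inside that MapFile
//   CrawlDatum result = (CrawlDatum) MapFileOutputFormat.getEntry(
//       readers, new HashPartitioner<Text, CrawlDatum>(), key, value);
//   // result is null if the URL is not in the CrawlDb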
// Quoted above already: promotes the new CrawlDb to "current", keeps the previous one as "old", and releases the lock
public static void install(JobConf job, Path crawlDb) throws IOException {
boolean preserveBackup = job.getBoolean("db.preserve.backup", true);
Path newCrawlDb = FileOutputFormat.getOutputPath(job);
FileSystem fs = new JobClient(job).getFs();
Path old = new Path(crawlDb, "old");
Path current = new Path(crawlDb, CURRENT_NAME);
if (fs.exists(current)) {
if (fs.exists(old)) fs.delete(old, true);
fs.rename(current, old);
}
fs.mkdirs(crawlDb);
fs.rename(newCrawlDb, current);
if (!preserveBackup && fs.exists(old)) fs.delete(old, true);
Path lock = new Path(crawlDb, LOCK_NAME);
LockUtil.removeLockFile(fs, lock);
}
// Entry point: ToolRunner.run() parses the generic Hadoop options (via GenericOptionsParser) and then invokes run() with the remaining arguments
public static void main(String[] args) throws Exception {
int res = ToolRunner.run(NutchConfiguration.create(), new CrawlDb(), args);
System.exit(res);
}
public int run(String[] args) throws Exception {
// Usage message printed when the arguments are missing or wrong
if (args.length < 1) {
System.err.println("Usage: CrawlDb <crawldb> (-dir <segments> | <seg1> <seg2> ...) [-force] [-normalize] [-filter] [-noAdditions]");
System.err.println("\tcrawldb\tCrawlDb to update");
System.err.println("\t-dir segments\tparent directory containing all segments to update from");
System.err.println("\tseg1 seg2 ...\tlist of segment names to update from");
System.err.println("\t-force\tforce update even if CrawlDb appears to be locked (CAUTION advised)");
System.err.println("\t-normalize\tuse URLNormalizer on urls in CrawlDb and segment (usually not needed)");
System.err.println("\t-filter\tuse URLFilters on urls in CrawlDb and segment");
System.err.println("\t-noAdditions\tonly update already existing URLs, don't add any newly discovered URLs");
return -1;
}
// getConf() holds the Hadoop/Nutch configuration; read the defaults for these flags from it
boolean normalize = getConf().getBoolean(CrawlDbFilter.URL_NORMALIZING, false);
boolean filter = getConf().getBoolean(CrawlDbFilter.URL_FILTERING, false);
boolean additionsAllowed = getConf().getBoolean(CRAWLDB_ADDITIONS_ALLOWED, true);
boolean force = false;
final FileSystem fs = FileSystem.get(getConf());
HashSet<Path> dirs = new HashSet<Path>();
// Check which switches were passed on the command line; they override the defaults set above
for (int i = 1; i < args.length; i++) {
if (args[i].equals("-normalize")) {
normalize = true;
} else if (args[i].equals("-filter")) {
filter = true;
} else if (args[i].equals("-force")) {
force = true;
} else if (args[i].equals("-noAdditions")) {
additionsAllowed = false;
} else if (args[i].equals("-dir")) {
// -dir: treat every subdirectory of the given path as a segment. HadoopFSUtil.getPassDirectoriesFilter is roughly:
// public static PathFilter getPassDirectoriesFilter(final FileSystem fs) {
// return new PathFilter() {
// public boolean accept(final Path path) {
// try {
// accept only paths that are directories
// return fs.getFileStatus(path).isDir();
// } catch (IOException ioe) {
// return false;
// }
// This lists the segment directories under the -dir argument
FileStatus[] paths = fs.listStatus(new Path(args[++i]), HadoopFSUtil.getPassDirectoriesFilter(fs));
// and adds them to the set of segment paths
dirs.addAll(Arrays.asList(HadoopFSUtil.getPaths(paths)));
} else {
// Any other argument is treated as an individual segment path
dirs.add(new Path(args[i]));
}
}
try {
// The whole task flow starts here: lock the db, run the merge job, install the result
update(new Path(args[0]), dirs.toArray(new Path[dirs.size()]), normalize, filter, additionsAllowed, force);
return 0;
} catch (Exception e) {
LOG.error("CrawlDb update: " + StringUtils.stringifyException(e));
return -1;
}
}
}
//Execution flow: main() calls the overridden run(); run() parses the arguments and calls update(), which delegates to the full update() with the parsed flags; that method in turn calls createJob() and install().
//In short, the CrawlDb class takes the crawldb path and segment paths from the command line, locks the crawldb, merges the segments' fetch/parse data into it with a MapReduce job, and installs the new version as "current".