ReactOS爬虫改进版

元始天尊 · 发表于 2014-2-28 13:20:24

欢迎访问技术宅的结界，请注册或者登录吧。

您需要登录才可以下载或查看，没有账号？立即注册→加入我们

×

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.net.URLConnection;
import java.util.Stack;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
public class ReactOSDownloader
{
/** Base URL of the remote listing that is crawled. */
public static String url="http://svn.reactos.org/";
/** Local root directory under which the remote tree is mirrored. */
public static String path="k:/reactos/";
/**
 * Counter of downloads currently in progress. It is modified by the download
 * threads and polled from other threads, so it is declared volatile to make
 * every update visible to the pollers.
 */
public static volatile int filethreadnum=0;
public static boolean setinit=false;// whether to force initialization (skip ahead to the directories named in initstring)
public static String[] initstring={};// directories at which updating should start, ordered by depth
public static int curdepth=0;// current initialization depth (index into initstring)
/** Pending directory-listing entries that still have to be resolved. */
public static Stack<FolderClass> folders=new Stack<FolderClass>();
/**
 * Worker thread that downloads a single remote file into the local mirror.
 *
 * The remote address is {@code url + filepath + curnode}; the local file is
 * {@code escape((path + filepath + curnode).replace("%20"," "))}.
 */
public static class FileThread extends Thread
{
    String filepath;
    String curnode;

    /**
     * @param filepath path of the containing directory, relative to the root
     * @param curnode  file name as found in the listing's href
     */
    FileThread(String filepath,String curnode)
    {
        this.filepath=filepath;
        this.curnode=curnode;
    }

    /**
     * Waits until the number of running downloads is not above 30, then
     * downloads the file. The running-download counter is only decremented
     * if this thread actually incremented it.
     */
    @Override
    public void run()
    {
        boolean counted=false;
        try
        {
            while(filethreadnum>30)
            {
                sleep(1000);
            }
            adjustActive(1);
            counted=true;
            download();
        }
        catch(InterruptedException e)
        {
            // Keep the interrupt visible to whoever owns this thread.
            Thread.currentThread().interrupt();
            System.out.println("error");
        }
        catch(Exception e)
        {
            System.out.println("error");
            System.err.println("download failed: "+filepath+curnode+" ("+e+")");
        }
        finally
        {
            // Interrupted while still waiting means the counter was never raised.
            if(counted)
                adjustActive(-1);
        }
    }

    /**
     * Copies the remote file to disk. Both streams are always closed, and a
     * partially written file is removed so that a later run does not mistake
     * it for a finished download.
     */
    private void download() throws IOException
    {
        String remote=filepath+curnode;
        File target=null;
        InputStream instream=null;
        FileOutputStream fs=null;
        boolean complete=false;
        try
        {
            URLConnection con=new URL(url+remote).openConnection();
            instream=con.getInputStream();
            target=new File(escape((path+remote).replace("%20"," ")));
            fs=new FileOutputStream(target);
            byte[] buffer=new byte[65536];
            long bytesum=0;// long: files above 2 GiB would overflow an int
            int byteread;
            while((byteread=instream.read(buffer)) != -1)
            {
                bytesum+=byteread;
                fs.write(buffer,0,byteread);
                System.out.println("\t\t当前下载文件："+remote+"\t当前大小："+bytesum);
            }
            complete=true;
        }
        finally
        {
            closeQuietly(fs);
            closeQuietly(instream);
            // Only delete what this thread created, and only after closing it.
            if(!complete && fs != null)
                target.delete();
        }
    }

    /** Adds delta to the shared download counter under a lock, since ++/-- are not atomic. */
    private static void adjustActive(int delta)
    {
        synchronized(ReactOSDownloader.class)
        {
            filethreadnum+=delta;
        }
    }

    /** Closes a stream if it was opened, ignoring any failure of close itself. */
    private static void closeQuietly(java.io.Closeable c)
    {
        if(c == null)
            return;
        try
        {
            c.close();
        }
        catch(IOException ignored)
        {
            // Nothing useful can be done when close fails during cleanup.
        }
    }
}
/**
 * Removes the characters {@code \ : * ? " < > |} from a path, leaving the
 * first three characters untouched, and then passes the cleaned remainder
 * through {@link #formatpath(String)}.
 *
 * NOTE(review): the untouched prefix presumably exists to keep a drive
 * prefix such as "k:/" intact - confirm against the configured path.
 *
 * @param src path to clean; inputs shorter than three characters are
 *            returned unchanged instead of raising an index error
 * @return the prefix followed by the cleaned remainder
 */
public static String escape(String src)
{
    final String stripped="\\:*?\"<>|";
    int len=src.length();
    int prefixLen=Math.min(3,len);
    StringBuilder rest=new StringBuilder(len-prefixLen);
    for(int i=prefixLen;i<len;i++)
    {
        char ch=src.charAt(i);
        if(stripped.indexOf(ch) < 0)
            rest.append(ch);
    }
    return src.substring(0,prefixLen)+formatpath(rest.toString());
}
/**
 * Collapses runs of two or more slashes into a single slash in strings that
 * contain "http". The first ten characters are left untouched, which keeps
 * the "//" of a leading "http://" intact. Strings without "http" are
 * returned unchanged.
 *
 * @param src address or path to normalize
 * @return the normalized string; inputs shorter than ten characters are
 *         returned unchanged rather than raising an index error
 */
public static String formatpath(String src)
{
    if(!src.contains("http"))
        return src;
    int keep=Math.min(10,src.length());
    return src.substring(0,keep)+src.substring(keep).replaceAll("/{2,}", "/");
}
/**
 * Makes sure a directory exists, creating it and any missing parent
 * directories.
 *
 * @param folderPath directory to create
 * @return folderPath, unchanged, whether or not the directory could be created
 */
public static String createFolder(String folderPath)
{
    try
    {
        File dir = new File(folderPath);
        // mkdirs also creates missing parents; re-check isDirectory in case
        // another thread created the directory in the meantime.
        if (!dir.exists() && !dir.mkdirs() && !dir.isDirectory())
        {
            System.out.println("错误!");
        }
    }
    catch (Exception e)
    {
        System.out.println("错误!");
    }
    return folderPath;
}
/**
 * One link taken from a directory listing, together with the path of the
 * directory in which it was found.
 */
public static class FolderClass
{
    String filepath;
    Element e;

    /**
     * @param e        anchor element whose href names the entry
     * @param filepath path of the containing directory, relative to the root
     */
    FolderClass(Element e,String filepath)
    {
        this.filepath=filepath;
        this.e=e;
    }

    /**
     * Processes this entry. A directory (href ending in '/') is created
     * locally, its listing is fetched and every link in it is pushed onto
     * the shared stack. A file is handed to a new download thread unless it
     * already exists locally.
     *
     * While setinit is on, entries that do not match initstring[curdepth]
     * are skipped; each match advances curdepth, and setinit is switched off
     * once every element of initstring has been matched.
     */
    public void ResolveFolder()
    {
        try
        {
            String curnode=e.attr("href");
            // Skip anchors without a target, links containing ';', absolute
            // links, the parent link and the "svn/" entry.
            if(curnode.isEmpty() || curnode.indexOf(';') != -1 || curnode.charAt(0) == '/' || curnode.equals("../") || curnode.equals("svn/"))
                return;
            System.out.println(curnode+"\t"+filepath);
            // With nothing left to match, indexing initstring would fail on every entry.
            if(setinit && curdepth >= initstring.length)
                setinit=false;
            if(setinit)
            {
                if(!curnode.equals(initstring[curdepth]))
                    return;
                curdepth++;
                if(curdepth >= initstring.length)
                    setinit=false;
            }
            String child=filepath+curnode;
            String localPath=escape((path+child).replace("%20"," "));
            if(curnode.endsWith("/"))
            {// directory
                createFolder(localPath);
                Document doc=Jsoup.connect(formatpath(url+child)).timeout(0).get();
                System.out.println("当前目录："+url+formatpath(filepath)+curnode);
                enqueue(doc.select("tbody tr a"),child);
                enqueue(doc.select("ul li a"),child);
            }
            else
            {// file
                // Same escaped path the download thread writes to, so files
                // fetched earlier are recognised.
                File curfile=new File(localPath);
                if(curfile.exists())
                    return;
                (new FileThread(filepath,curnode)).start();
            }
        }
        catch(Exception ex)
        {
            System.out.println("error");
            System.err.println("resolve failed: "+filepath+" ("+ex+")");
        }
    }

    /** Pushes every link of a listing onto the shared stack, recording its parent directory. */
    private static void enqueue(Elements links,String parent)
    {
        for(Element link:links)
        {
            folders.push(new FolderClass(link,parent));
        }
    }
}
/**
 * Entry point: loads the root listing, queues every link found in it, and
 * resolves the queued entries on a separate thread until none are left.
 * Afterwards it waits until the download counter has dropped back to zero.
 *
 * @param args unused
 * @throws IOException declared for compatibility; failures are reported on
 *                     the console instead
 */
public static void main(String[] args) throws IOException
{
    try
    {
        Document doc = Jsoup.connect(url).timeout(0).get();
        createFolder(path);
        for(Element e1:doc.select("tbody tr a"))
        {
            folders.push(new FolderClass(e1,""));
        }
        for(Element e2:doc.select("ul li a"))
        {
            folders.push(new FolderClass(e2,""));
        }
        Thread resolver=new Thread()
        {
            @Override
            public void run()
            {
                // Only this thread pops, so checking before every pop is
                // enough to avoid popping an empty stack.
                while(!folders.empty())
                {
                    folders.pop().ResolveFolder();
                }
            }
        };
        resolver.start();
        // Wait for the resolver itself: the stack can be empty while the
        // last directory is still being fetched.
        resolver.join();
        while(filethreadnum != 0)
        {
            Thread.sleep(1000);
        }
    }
    catch(InterruptedException exc)
    {
        Thread.currentThread().interrupt();
        System.out.println("错误!");
    }
    catch(Exception exc)
    {
        System.out.println("错误!");
        System.err.println("crawl failed: "+exc);
    }
}
}

复制代码

以前那个是只能多线程文件下载，现在支持多线程文件夹解析，为了实现这个用到了栈这种数据结构，之所以用栈是有原因的，我先考虑的是队列，因为写搜索工具用的是这个数据结构。
现在来分析为什么要用栈而不用队列，假设目录情况是这样的，从上到下为父子关系树：
                                       A
                     B                                  C
      D                   E                F                G
H          I       J       K       L    M       N       O
P  Q       R S T  U    V  W X  Y Z  A B  C    D  E
目录只能逐层遍历，且只能按照先序遍历，这是因为作为下载器，磁盘上只有建立了父文件夹才能建立子文件夹，本身是递归的。
遍历顺序如下：ABDHPQIRSEJTUKVWCFLXYMZAGNBCODE
因为目录众多，因此多线程需要设定一个线程数量上限，多线程也就是多任务，现在的目的是在解析完当前文件夹时将子文件夹加入任务，那么顺序问题是值得考虑的，因为搞不好，会导致2种情况：1.任务太多，占用内存过多，2.由于设定了上限，会造成死锁，也就是任务等待加入自己的子任务以后才能结束，也就是等这个上限值低于阈值时结束，而该阈值要等该任务结束才能降低。针对这种情况，可以选择2种数据结构：栈和队列

使用队列和栈解析目录的时候，把所有子目录加入任务，相应地主函数启动线程用于定时取出一定数量文件夹进行解析，这样的好处是，取出的一定是子文件夹，这样递归以后的结果是先递归完子文件夹再递归其他兄弟节点，不至于栈溢出。
下面是同时进行的任务数为2的推演：
使用栈的结果为：A->CB->GFED->ONMLED->DECBMLED->AZYXED->KJIH->WVUTIH->SRQP->结束
使用队列的结果为：A->BC->DEFG->HIJKFG->PQRSJKFG->TUVWFG->LMNO->XYZANO->BCDE->结束

经测试，速度很快

问题1：上面代码是栈实现的，请自己写出队列版代码
问题2：代码中实现了一种功能，它允许你今天下载了一部分reactos代码，然后结束java程序，第二天开电脑的时候只需要设置以下参数，就可以从昨天开始处继续运行，请问代码中如何实现的

我有个梦 · 发表于 2014-3-26 16:23:50

爬ReactOs的。学习了

账号		自动登录	找回密码
密码			立即注册→加入我们

ReactOS爬虫改进版

欢迎访问技术宅的结界，请注册或者登录吧。

浏览过的版块