ReactOS爬虫之再改版
上一篇(http://www.0xaa55.com/thread-285-1-1.html)给大家留了2个问题,现在揭晓答案。
问题1:上面代码是栈实现的,请自己写出队列版代码
堆栈版:
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.Serializable;
import java.net.URL;
import java.net.URLConnection;
import java.util.Stack;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.parser.Tag;
import org.jsoup.select.Elements;
public class ReactOSDownloaderStack
{
// Base URL of the remote listing; entry paths are appended to it when fetching.
public static String url="http://svn.reactos.org/";
// Local root directory under which folders are created and files are written.
public static String path="k:/reactos/";
// Counter of downloads in progress; FileThread.run() increments and decrements it.
public static int filethreadnum=0;
public static boolean setinit=false;// whether to force initialization
public static String[] initstring={};// directories from which updating should start, ordered by depth
public static int curdepth=0;// current initialization depth
// Pending entries still to be resolved; used as a LIFO stack.
public static Stack<FolderClass> folders=new Stack<FolderClass>();
/**
 * Serializable stand-in for a parsed link element: it keeps only the
 * value of the element's {@code href} attribute.
 */
public static class SerialElement implements Serializable
{
    private static final long serialVersionUID = 1L;

    /** Value of the {@code href} attribute; empty until the constructor fills it. */
    public String attrhref="";

    /**
     * Copies the {@code href} attribute out of the given element.
     *
     * @param ele parsed element to read the attribute from
     */
    public SerialElement(Element ele)
    {
        this.attrhref=ele.attr("href");
    }
}
/**
 * Serializable snapshot of the pending-entry stack, used to write the
 * crawl state to disk and to restore it again.
 */
public static class SerialData implements Serializable
{
    private static final long serialVersionUID = 1L;

    /** Private copy of the entries captured at construction time. */
    public Stack<FolderClass> inner=new Stack<FolderClass>();

    /**
     * Takes a copy of every entry currently in {@code data}.
     *
     * @param data stack to snapshot; it is not modified
     */
    public SerialData(Stack<FolderClass> data)
    {
        replaceContents(inner,data);
    }

    /**
     * Replaces the contents of {@code data} with the captured entries.
     *
     * @param data stack to overwrite
     */
    public void ReadFromData(Stack<FolderClass> data)
    {
        replaceContents(data,inner);
    }

    /** Empties {@code target} and then appends everything from {@code source}. */
    private static void replaceContents(Stack<FolderClass> target,Stack<FolderClass> source)
    {
        target.clear();
        target.addAll(source);
    }
}
public static class FileThread extends Thread
{
String filepath;
String curnode;
FileThread(String filepath,String curnode)
{
this.filepath=filepath;
this.curnode=curnode;
}
@Override
public void run()
{
try
{
while(filethreadnum>30)
{
sleep(1000);
}
filethreadnum++;
int byteread=0;
int bytesum=0;
URL weburl=new URL(url+filepath+curnode);
URLConnection con=weburl.openConnection();
InputStream instream=con.getInputStream();
FileOutputStream fs=new FileOutputStream(escape((path+filepath+curnode).replace("%20"," ")));
byte[] buffer=new byte;
while((byteread=instream.read(buffer)) != -1)
{
bytesum+=byteread;
fs.write(buffer,0,byteread);
System.out.println("\t\t当前下载文件:"+filepath+curnode+"\t当前大小:"+bytesum);
}
fs.close();
instream.close();
filethreadnum--;
}
catch(Exception e)
{
System.out.println("error");
filethreadnum--;
new File(path+filepath+curnode).deleteOnExit();;
}
}
}
/**
 * Removes characters that are not allowed in file names
 * ({@code \ : * ? " < > |}) from a local path. The first three characters
 * are kept untouched, which preserves a drive prefix such as "k:/". The
 * filtered remainder is then passed through {@link #formatpath(String)}.
 *
 * @param src local path to clean
 * @return the cleaned path; inputs of three characters or fewer are
 *         returned unchanged because there is nothing after the prefix
 */
public static String escape(String src)
{
    int len=src.length();
    if(len <= 3)
        return src; // nothing beyond the protected prefix; also avoids substring(0,3) failing
    StringBuilder kept=new StringBuilder(len);
    for(int i=3;i<len;i++)
    {
        char ch=src.charAt(i);
        if("\\:*?\"<>|".indexOf(ch) < 0)
            kept.append(ch);
    }
    return src.substring(0,3)+formatpath(kept.toString());
}
/**
 * Collapses runs of two or more slashes into a single slash for strings
 * that contain "http". The first ten characters are left as they are, so
 * the "//" of a leading "http://" survives.
 *
 * @param src path or URL to normalise
 * @return {@code src} unchanged when it does not contain "http" or is no
 *         longer than the ten-character prefix; otherwise the prefix
 *         followed by the remainder with repeated slashes collapsed
 */
public static String formatpath(String src)
{
    final int prefixLength=10;
    // Short inputs have no remainder to collapse; returning early also
    // keeps substring(0,10) from throwing on them.
    if(!src.contains("http") || src.length() <= prefixLength)
        return src;
    return src.substring(0,prefixLength)+src.substring(prefixLength).replaceAll("/{2,}", "/");
}
/**
 * Creates the directory {@code folderPath} if it does not exist yet
 * (single level only, via {@code File.mkdir()}). Any exception is
 * reported on standard output and otherwise ignored.
 *
 * @param folderPath directory to create
 * @return {@code folderPath}, unchanged
 */
public static String createFolder(String folderPath)
{
    try
    {
        File directory = new File(folderPath);
        boolean missing = !directory.exists();
        if (missing)
        {
            directory.mkdir();
        }
    }
    catch (Exception failure)
    {
        System.out.println("错误!");
    }
    return folderPath;
}
/**
 * One pending entry of the crawl: a link (file or directory) together
 * with the remote directory path it was found in.
 */
public static class FolderClass implements Serializable
{
    /** Remote directory path, relative to the base URL, that contains the link. */
    String filepath;
    /** The link itself, reduced to its href value. */
    SerialElement e;

    /**
     * @param e        link found in a directory listing
     * @param filepath remote directory path the link belongs to
     */
    public FolderClass(SerialElement e,String filepath)
    {
        this.filepath=filepath;
        this.e=e;
    }

    /**
     * Processes this entry. A directory (href ending in '/') is created
     * locally, its listing is fetched and every link in it is pushed onto
     * {@code folders}. A file is handed to a new {@link FileThread}
     * unless it already exists locally. Any exception is reported as
     * "error" and the entry is dropped.
     */
    public void ResolveFolder()
    {
        try
        {
            String curnode=e.attrhref;
            // An empty href cannot be a file or directory name; skipping it
            // also keeps the charAt() calls below in range.
            if(curnode.length() == 0)
                return;
            if(curnode.indexOf(';') != -1 || curnode.charAt(0) == '/' || curnode.equals("../") || curnode.equals("svn/"))
                return;
            System.out.println(curnode+"\t"+filepath);
            if(setinit)
            {
                if(curdepth >= initstring.length)
                {
                    // Nothing (left) to match against: forced initialization is over.
                    setinit=false;
                }
                else
                {
                    // The published listing shows "equals(initstring)", which compares
                    // a String with the whole array and is always false. The entry for
                    // the current depth is what the surrounding logic needs.
                    if(!curnode.equals(initstring[curdepth]))
                        return;
                    curdepth++;
                    if(curdepth >= initstring.length)
                        setinit=false;
                }
            }
            if(curnode.charAt(curnode.length()-1) == '/')
            {// directory
                createFolder(escape((path+filepath+curnode).replace("%20"," ")));
                Document doc=Jsoup.connect(formatpath(url+filepath+curnode)).timeout(0).get();
                System.out.println("当前目录:"+url+formatpath(filepath)+curnode);
                Elements items=doc.select("tbody tr a");
                for(Element ele1:items)
                {
                    folders.push(new FolderClass(new SerialElement(ele1),filepath+curnode));
                }
                items.clear();
                items=doc.select("ul li a");
                for(Element ele2:items)
                {
                    folders.push(new FolderClass(new SerialElement(ele2),filepath+curnode));
                }
            }
            else
            {// file
                // Check the same escaped name FileThread writes to; otherwise a file
                // whose name is altered by escape() would be downloaded again every run.
                File curfile=new File(escape((path+filepath+curnode).replace("%20"," ")));
                if(curfile.exists())
                    return;
                (new FileThread(filepath,curnode)).start();
            }
        }
        catch(Exception ex)
        {
            System.out.println("error");
        }
    }
}
/**
 * Entry point of the stack-based crawler.
 * <ol>
 * <li>Restores the pending stack from the file "savedata" when that file
 *     holds data; otherwise seeds the stack from the root listing.</li>
 * <li>Starts a background thread that writes the stack to "savedata"
 *     once per second.</li>
 * <li>Starts the worker thread that pops and resolves entries.</li>
 * <li>Waits until no downloads are counted and the stack is empty.</li>
 * </ol>
 *
 * @param args unused
 * @throws IOException declared for compatibility; failures are caught and printed
 */
public static void main(String[] args) throws IOException
{
    try
    {
        File savefile=new File("savedata");
        // A zero-length file is what createNewFile() below leaves behind when the
        // program stops before the first save; it holds no state to restore.
        if(savefile.exists() && savefile.length() > 0)
        {
            FileInputStream fis=new FileInputStream("savedata");
            try
            {
                ObjectInputStream ois=new ObjectInputStream(fis);
                SerialData data=(SerialData) ois.readObject();
                data.ReadFromData(folders);
                ois.close();
            }
            finally
            {
                // Release the file handle even when deserialization fails.
                fis.close();
            }
        }
        else
        {
            Document doc = Jsoup.connect(url).timeout(0).get();
            Elements items=doc.select("tbody tr a");
            createFolder(path);
            for(Element e1:items)
            {
                folders.push(new FolderClass(new SerialElement(e1),""));
            }
            items=doc.select("ul li a");
            for(Element e2:items)
            {
                folders.push(new FolderClass(new SerialElement(e2),""));
            }
            savefile.createNewFile();
        }
        Thread saver=new Thread()
        {
            @Override
            public void run()
            {
                while(true)
                {
                    try
                    {
                        sleep(1000);
                        FileOutputStream fos=new FileOutputStream("savedata");
                        try
                        {
                            ObjectOutputStream oos=new ObjectOutputStream(fos);
                            SerialData data=new SerialData(folders);
                            oos.writeObject(data);
                            oos.close();
                        }
                        finally
                        {
                            fos.close();
                        }
                    }
                    catch(Exception saveError)
                    {
                        // Keep trying on the next tick, but do not hide the failure.
                        System.out.println("error saving state: "+saveError);
                    }
                }
            }
        };
        // The save loop never ends on its own; as a daemon it no longer keeps the
        // JVM alive after the crawl and all downloads have finished.
        saver.setDaemon(true);
        saver.start();
        new Thread()
        {
            @Override
            public void run()
            {
                while(!folders.empty())
                {
                    int searchonce=30;
                    while(searchonce-- > 0 && !folders.isEmpty())
                    {
                        folders.pop().ResolveFolder();
                    }
                }
            }
        }.start();
        while(filethreadnum != 0 || folders.size() != 0)
        {
            Thread.sleep(1000);
        }
    }
    catch(Exception exc)
    {
        System.out.println("错误!"+exc.getMessage());
    }
}
}
队列版:
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.net.URLConnection;
import java.util.LinkedList;
import java.util.Queue;
import java.util.Stack;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
public class ReactOSDownloaderQueue
{
// Base URL of the remote listing; entry paths are appended to it when fetching.
public static String url="http://svn.reactos.org/";
// Local root directory under which folders are created and files are written.
public static String path="k:/reactos/";
// Counter of downloads in progress; FileThread.run() increments and decrements it.
public static int filethreadnum=0;
public static boolean setinit=false;// whether to force initialization
public static String[] initstring={};// directories from which updating should start, ordered by depth
public static int curdepth=0;// current initialization depth
// Pending entries still to be resolved; used as a FIFO queue.
public static Queue<FolderClass> folders=new LinkedList<FolderClass>();
public static class FileThread extends Thread
{
String filepath;
String curnode;
FileThread(String filepath,String curnode)
{
this.filepath=filepath;
this.curnode=curnode;
}
@Override
public void run()
{
try
{
while(filethreadnum>100)
{
sleep(1000);
}
filethreadnum++;
int byteread=0;
int bytesum=0;
URL weburl=new URL(url+filepath+curnode);
URLConnection con=weburl.openConnection();
InputStream instream=con.getInputStream();
FileOutputStream fs=new FileOutputStream(escape((path+filepath+curnode).replace("%20"," ")));
byte[] buffer=new byte;
while((byteread=instream.read(buffer)) != -1)
{
bytesum+=byteread;
fs.write(buffer,0,byteread);
System.out.println("\t\t当前下载文件:"+filepath+curnode+"\t当前大小:"+bytesum);
}
fs.close();
instream.close();
filethreadnum--;
}
catch(Exception e)
{
System.out.println("error");
filethreadnum--;
new File(path+filepath+curnode).deleteOnExit();;
}
}
}
/**
 * Removes characters that are not allowed in file names
 * ({@code \ : * ? " < > |}) from a local path. The first three characters
 * are kept untouched, which preserves a drive prefix such as "k:/". The
 * filtered remainder is then passed through {@link #formatpath(String)}.
 *
 * @param src local path to clean
 * @return the cleaned path; inputs of three characters or fewer are
 *         returned unchanged because there is nothing after the prefix
 */
public static String escape(String src)
{
    int len=src.length();
    if(len <= 3)
        return src; // nothing beyond the protected prefix; also avoids substring(0,3) failing
    StringBuilder kept=new StringBuilder(len);
    for(int i=3;i<len;i++)
    {
        char ch=src.charAt(i);
        if("\\:*?\"<>|".indexOf(ch) < 0)
            kept.append(ch);
    }
    return src.substring(0,3)+formatpath(kept.toString());
}
/**
 * Collapses runs of two or more slashes into a single slash for strings
 * that contain "http". The first ten characters are left as they are, so
 * the "//" of a leading "http://" survives.
 *
 * @param src path or URL to normalise
 * @return {@code src} unchanged when it does not contain "http" or is no
 *         longer than the ten-character prefix; otherwise the prefix
 *         followed by the remainder with repeated slashes collapsed
 */
public static String formatpath(String src)
{
    final int prefixLength=10;
    // Short inputs have no remainder to collapse; returning early also
    // keeps substring(0,10) from throwing on them.
    if(!src.contains("http") || src.length() <= prefixLength)
        return src;
    return src.substring(0,prefixLength)+src.substring(prefixLength).replaceAll("/{2,}", "/");
}
/**
 * Creates the directory {@code folderPath} if it does not exist yet
 * (single level only, via {@code File.mkdir()}). Any exception is
 * reported on standard output and otherwise ignored.
 *
 * @param folderPath directory to create
 * @return {@code folderPath}, unchanged
 */
public static String createFolder(String folderPath)
{
    try
    {
        File directory = new File(folderPath);
        boolean missing = !directory.exists();
        if (missing)
        {
            directory.mkdir();
        }
    }
    catch (Exception failure)
    {
        System.out.println("错误!");
    }
    return folderPath;
}
/**
 * One pending entry of the crawl: a link (file or directory) together
 * with the remote directory path it was found in.
 */
public static class FolderClass
{
    /** Remote directory path, relative to the base URL, that contains the link. */
    String filepath;
    /** The parsed link element; only its href attribute is read. */
    Element e;

    /**
     * @param e        link found in a directory listing
     * @param filepath remote directory path the link belongs to
     */
    FolderClass(Element e,String filepath)
    {
        this.filepath=filepath;
        this.e=e;
    }

    /**
     * Processes this entry. A directory (href ending in '/') is created
     * locally, its listing is fetched and every link in it is appended to
     * {@code folders}. A file is handed to a new {@link FileThread}
     * unless it already exists locally. Any exception is reported as
     * "error" and the entry is dropped.
     */
    public void ResolveFolder()
    {
        try
        {
            String curnode=e.attr("href");
            // An empty href cannot be a file or directory name; skipping it
            // also keeps the charAt() calls below in range.
            if(curnode.length() == 0)
                return;
            if(curnode.indexOf(';') != -1 || curnode.charAt(0) == '/' || curnode.equals("../") || curnode.equals("svn/"))
                return;
            System.out.println(curnode+"\t"+filepath);
            if(setinit)
            {
                if(curdepth >= initstring.length)
                {
                    // Nothing (left) to match against: forced initialization is over.
                    setinit=false;
                }
                else
                {
                    // The published listing shows "equals(initstring)", which compares
                    // a String with the whole array and is always false. The entry for
                    // the current depth is what the surrounding logic needs.
                    if(!curnode.equals(initstring[curdepth]))
                        return;
                    curdepth++;
                    if(curdepth >= initstring.length)
                        setinit=false;
                }
            }
            if(curnode.charAt(curnode.length()-1) == '/')
            {// directory
                createFolder(escape((path+filepath+curnode).replace("%20"," ")));
                Document doc=Jsoup.connect(formatpath(url+filepath+curnode)).timeout(0).get();
                System.out.println("当前目录:"+url+formatpath(filepath)+curnode);
                Elements items=doc.select("tbody tr a");
                for(Element ele1:items)
                {
                    folders.offer(new FolderClass(ele1,filepath+curnode));
                }
                items.clear();
                items=doc.select("ul li a");
                for(Element ele2:items)
                {
                    folders.offer(new FolderClass(ele2,filepath+curnode));
                }
            }
            else
            {// file
                // Check the same escaped name FileThread writes to; otherwise a file
                // whose name is altered by escape() would be downloaded again every run.
                File curfile=new File(escape((path+filepath+curnode).replace("%20"," ")));
                if(curfile.exists())
                    return;
                (new FileThread(filepath,curnode)).start();
            }
        }
        catch(Exception ex)
        {
            System.out.println("error");
        }
    }
}
/**
 * Entry point of the queue-based crawler: seeds the queue with the links
 * of the root listing, starts the worker thread that resolves entries in
 * FIFO order, then sleeps while the download counter is non-zero.
 *
 * @param args unused
 * @throws IOException declared for compatibility; failures are caught and printed
 */
public static void main(String[] args) throws IOException
{
    try
    {
        Document rootListing = Jsoup.connect(url).timeout(0).get();
        Elements links=rootListing.select("tbody tr a");
        createFolder(path);
        for(Element tableLink:links)
        {
            folders.offer(new FolderClass(tableLink,""));
        }
        links=rootListing.select("ul li a");
        for(Element listLink:links)
        {
            folders.offer(new FolderClass(listLink,""));
        }
        Runnable drainQueue=new Runnable()
        {
            @Override
            public void run()
            {
                while(!folders.isEmpty())
                {
                    // Handle at most 100 entries per pass, as many as are queued right now.
                    int batch=Math.min(folders.size(),100);
                    for(int handled=0;handled<batch;handled++)
                    {
                        folders.poll().ResolveFolder();
                    }
                }
            }
        };
        new Thread(drainQueue).start();
        while(filethreadnum != 0)
        {
            Thread.sleep(1000);
        }
    }
    catch(Exception exc)
    {
        System.out.println("错误!");
    }
}
}
问题2:代码中实现了一种功能:今天下载了一部分reactos代码后结束java程序,第二天开机时只需要设置以下参数,就可以从昨天中断的地方继续运行。请问代码中是如何实现的?
在最早的版本(非队列非堆栈版)里,这种继续下载的机制是通过记录目录实现的
关键代码在:
public static boolean setinit=false;//是否强制初始化
。。。。。
if(setinit)
{
if(!curnode.equals(initstring[curdepth]))
return;
else
curdepth++;
if(curdepth >= initstring.length)
setinit=false;
}
而在上面的堆栈版里,是通过java的序列化机制实现的:每隔1秒将堆栈数据写入文件,启动时再读出。序列化和反射是java最强大的2个功能,其中序列化可以保存任何实现了Serializable接口的对象,包括自定义类的实例。
关键代码在:
public static class SerialElement implements Serializable
{
/**
* Value of the element's href attribute; this is the only state that is kept.
*/
public String attrhref="";
private static final long serialVersionUID = 1L;
// Copies the href attribute out of the given element.
public SerialElement(Element ele)
{
attrhref=ele.attr("href");
}
}
public static class SerialData implements Serializable
{
/**
* Private copy of the stack entries captured when this object was constructed.
*/
public Stack<FolderClass> inner=new Stack<FolderClass>();
private static final long serialVersionUID = 1L;
// Snapshots every entry of data into inner; data itself is not modified.
public SerialData(Stack<FolderClass> data)
{
inner.clear();
inner.addAll(data);
}
// Replaces the contents of data with the captured entries.
public void ReadFromData(Stack<FolderClass> data)
{
data.clear();
data.addAll(inner);
}
}
。。。
if(new File("savedata").exists())
{
FileInputStream fis=new FileInputStream("savedata");
ObjectInputStream ois=new ObjectInputStream(fis);
SerialData data=(SerialData) ois.readObject();
data.ReadFromData(folders);
ois.close();
fis.close();
}
。。。
new Thread()
{
@Override
public void run()
{
while(true)
{
try
{
sleep(1000);
FileOutputStream fos=new FileOutputStream("savedata");
ObjectOutputStream oos=new ObjectOutputStream(fos);
SerialData data=new SerialData(folders);
oos.writeObject(data);
oos.close();
fos.close();
}
catch(Exception e)
{
}
}
}
}.start();
经测试,性能没有较大改变,且输出的文件总在7k以内,说明堆栈中的项目最多在200个左右,数量很少
这次给你们留的问题so-easy,
①我只实现了堆栈版的序列化,现在你们实现队列版的序列化
②对比堆栈版和队列版运行速度,并说明原因
不明觉厉~~mark
页:
[1]