因为最近需要很多图片素材做机器训练,所以写一个爬虫来下载百度图片里的一些图片,用作机器学习。
先看看效果图:
实现思路:根据百度图片的Ajax请求,用Chrome获取到URL,然后通过改变URL参数,获得百度图片Ajax请求的JSON。解析获得每一张图片的URL。
爬虫启动后,新建两百个线程,用来并发下载图片,充分利用带宽。
使用ConcurrentLinkedQueue创建队列,将第一步中解析出的链接添入。
第二步启动的两百个线程监听上面的队列,从中取出图片链接并下载。
因为ConcurrentLinkedQueue是线程安全的,所以不用担心下载重复了。
项目一共有三个核心文件和一个Main函数启动文件。代码如下。
Download.java
import java.io.File;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.URL;
import java.net.URLConnection;
/**
* Created by 杨小龙 on 2016-07-24.
* 邮箱:1172875805@qq.com
*/
/**
 * Downloads a single image from a URL into a target directory.
 * All failures are handled internally: the partially written file is
 * removed and a message is printed; no exception escapes to the caller.
 */
public class Download {
    // Source URL of the image to fetch.
    private String urlString;
    // Directory the image is saved into (created on demand).
    private String savePath;
    // File name to write inside savePath.
    private String filename;

    /**
     * @param urlString source URL of the image
     * @param savePath  target directory; created if it does not exist
     * @param filename  name of the file written into savePath
     */
    public Download(String urlString, String savePath, String filename) {
        this.urlString = urlString;
        this.savePath = savePath;
        this.filename = filename;
    }

    /**
     * Streams the URL's content to savePath/filename in 1 KiB chunks.
     * On any error the partial output file is deleted and the method
     * returns normally (the crawl must not stop on one bad image).
     */
    public void downloadCore() {
        File dir = new File(this.savePath);
        // File(parent, child) builds a platform-correct path; the original
        // hard-coded "\\" separator only worked on Windows.
        File target = new File(dir, this.filename);
        try {
            URL url = new URL(this.urlString);
            URLConnection con = url.openConnection();
            // 5 s connect timeout so a dead host cannot stall a worker thread.
            con.setConnectTimeout(5 * 1000);
            // Also bound the read: without it a stalled transfer hangs forever.
            con.setReadTimeout(5 * 1000);
            if (!dir.exists()) {
                dir.mkdirs();
            }
            // try-with-resources closes both streams even when read/write
            // throws; the original leaked them on failure.
            try (InputStream is = con.getInputStream();
                 OutputStream os = new FileOutputStream(target)) {
                byte[] buf = new byte[1024];
                int len;
                while ((len = is.read(buf)) != -1) {
                    os.write(buf, 0, len);
                }
            }
        } catch (Exception e) {
            // Remove only the partial FILE. The original called
            // sf.delete() on the save directory, which was a bug.
            if (target.exists()) {
                target.delete();
            }
            System.out.println("问题图片!跳过!");
        }
    }
}
DownloadGirl.java
import org.codehaus.jackson.JsonNode;
import org.codehaus.jackson.map.ObjectMapper;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import java.util.concurrent.ConcurrentLinkedQueue;
/**
* Created by 杨小龙 on 2016-07-24.
* 邮箱:1172875805@qq.com
*/
/**
 * Producer side of the crawler: pages through Baidu image search's AJAX
 * endpoint, extracts image URLs and feeds them to a pool of 200 worker
 * threads through a shared thread-safe queue.
 */
public class DownloadGirl {
    // Shared work queue. ConcurrentLinkedQueue is thread-safe, so the
    // producer and the workers need no extra locking. Typed <String> —
    // the original raw type forced an unchecked assignment in the workers.
    private static ConcurrentLinkedQueue<String> queue = new ConcurrentLinkedQueue<String>();
    private static int threadNum = 0;

    /** Starts 200 worker threads that drain the queue and download images. */
    public static void startThread() {
        while (threadNum < 200) {
            Thread thread = new Thread(new DownloadThread(queue));
            thread.start();
            threadNum++;
        }
    }

    /**
     * Pages through the search results 30 at a time ("pn" parameter),
     * pushing every non-empty "objURL" onto the queue. Stops when a page
     * comes back with no results; on a per-page error, logs the URL and
     * moves on to the next page instead of aborting the crawl.
     */
    public static void getImgUrl() {
        int begin = 0;
        // Hoisted out of the loop: ObjectMapper is expensive to build and
        // reusable across requests.
        ObjectMapper mapper = new ObjectMapper();
        while (true) {
            // NOTE(review): "p_w_picpath" looks like a mangled copy of
            // "image" in the host name — confirm this URL actually resolves.
            String url = "http://p_w_picpath.baidu.com/search/avatarjson?tn=resultjsonavatarnew&ie=utf-8&word=%E4%B8%9D%E8%A2%9C%E7%BE%8E%E5%A5%B3&cg=girl&pn=" + begin + "&rn=30&itg=0&z=0&fr=&lm=-1&ic=0&s=0&st=-1&gsm=4d0d0000005a";
            try {
                Connection con = Jsoup.connect(url);
                // ignoreContentType: the endpoint returns JSON, not HTML.
                Document doc = con.ignoreContentType(true).timeout(30000).get();
                String json = doc.text();
                JsonNode node = mapper.readTree(json);
                JsonNode imgs = node.get("imgs");
                // Explicit null check: a response without "imgs" used to
                // surface as an NPE silently eaten by the catch below.
                if (imgs == null || imgs.size() == 0) {
                    break;
                }
                for (JsonNode item : imgs) {
                    String temp_url = item.get("objURL").asText();
                    if (!temp_url.equals("")) {
                        queue.offer(temp_url);
                    } else {
                        System.out.println("空连接!!!!");
                    }
                }
                begin = begin + 30;
                System.out.println("完成一百!");
                // Crude throttle: pause while the workers still have backlog.
                if (!queue.isEmpty()) {
                    Thread.sleep(1000);
                    System.out.println("等待一秒!!!");
                }
            } catch (Exception e) {
                // Log the failing page URL and skip to the next page.
                System.out.println(url);
                begin = begin + 30;
            }
        }
    }
}
DownloadThread.java
import java.util.UUID;
import java.util.concurrent.ConcurrentLinkedQueue;
/**
* Created by 杨小龙 on 2016-07-24.
* 邮箱:1172875805@qq.com
*/
public class DownloadThread implements Runnable {
private ConcurrentLinkedQueue queue;
public DownloadThread(ConcurrentLinkedQueue queue){
this.queue = queue;
}
public void run(){
while (true){
String url = queue.poll();
while(url == null){
try{
Thread.sleep(1000);
}catch (Exception e){
e.printStackTrace();
}
finally {
url = queue.poll();
}
}
String uuid = UUID.randomUUID().toString();
String index = uuid.substring(0,2);
Download download = new Download(url, "./data/"+index+"/", uuid+".jpg");
download.downloadCore();
}
}
}
最后的文件是Main文件,用来启动该程序!
Test.java
/**
* Created by 杨小龙 on 2016/7/3.
* 邮箱:1172875805@qq.com
*/
/** Launcher: wires the worker pool to the crawler. */
public class Test {
    /**
     * Entry point. Workers are started first so the queue is drained as
     * soon as the crawler begins producing URLs.
     */
    public static void main(String[] args) throws Exception {
        // Spin up the 200 download workers listening on the shared queue.
        DownloadGirl.startThread();
        // Then crawl Baidu image search and feed that queue.
        DownloadGirl.getImgUrl();
    }
}