【java爬虫】爬虫+基于接口的网络爬虫

爬虫+基于接口的网络爬虫

上一篇讲了【java爬虫】---爬虫+jsoup轻松爬博客，该方式有个很大的局限性，就是你通过jsoup爬虫只适合爬静态网页，所以只能爬当前页面的所有新闻。如果需要爬一个网站所有信息，就得通过接口，通过改变参数反复调该网站的接口，爬到该网站的所有数据信息。

本博客以爬金色财经新闻信息为对象，去爬取该网站从建站以来发表的所有新闻信息。下面会一步一步讲解。这里重点讲思路，最后我会提供完整源码。

第一步：找接口

你要获得该网站所有新闻数据，第一步当然是获得接口，通过接口来获取所有信息。

F12-->Network-->all，找到接口：https://api.jinse.com/v4/information/list?catelogue_key=news&limit=23&information_id=56630&flag=down&version=9.9.9

对这三个参数做个说明：

limit=23 代表每次调用该接口返回23条数据。

information_id=56630 代表下面返回的23条数据是通过大于56630或者小于56630这个ID值来返回数据。

flag=down 代表向下翻页，这里也就是指ID小于56630的23条数据。

通过postMan测试

输入：https://api.jinse.com/v4/information/list?catelogue_key=news&limit=2&information_id=0&flag=down&version=9.9.9（这里返回两条，id=0这里代表最新的两条数据）

返回json数据格式：

{
    "news": 2,
    "count": 2,
    "total": null,
    "top_id": 58300,
    "bottom_id": 58325,
    "list": [
        {
            "id": 58300,
            "title": "跨越牛熊的摆渡人：看金融IT服务如何助力加密货币交易",
            "short_title": "当传统金融IT服务商进入加密货币时代",
            "type": 1,
            "order": 0,
            "is_top": false,
            "extra": {
                "version": "9.9.9",
                "summary": "存量资金与投资者日渐枯竭，如何获取新用户和新资金入场，成为大小交易所都在考虑的问题。而交易深度有限、流动性和行情稳定性不佳，也成为横亘在牛熊之间的一道障碍。",
                "published_at": 1532855806,
                "author": "临渊",
                "author_avatar": "https://img.jinse.com/753430_image20.png",
                "author_id": 127939,
                "author_level": 1,
                "read_number": 27064,
                "read_number_yuan": "2.7万",
                "thumbnail_pic": "https://img.jinse.com/996033_image1.png",
                "thumbnails_pics": [
                    "https://img.jinse.com/996033"
                ],
                "thumbnail_type": 1,
                "source": "金色财经",
                "topic_url": "https://m.jinse.com/news/blockchain/219916.html",
                "attribute_exclusive": "",
                "attribute_depth": "深度",
                "attribute_spread": ""
            }
        },
        {
            "id": 58325,
            "title": "各路大佬怎样看待区块链：技术新武器应寻找新战场",
            "short_title": "各路大佬怎样看待区块链：技术新武器应寻找新战场",
            "type": 1,
            "order": 0,
            "is_top": false,
            "extra": {
                "version": "9.9.9",
                "summary": "今年年初由区块链社区引发的讨论热潮,成为全民一时热议的话题,罕有一项技术,能像区块链这样——在其应用还未大范围铺开、被大众直观感知时,就搅起舆论风暴,扰动民众情绪。",
                "published_at": 1532853425,
                "author": "新浪财经",
                "author_avatar": "https://img.jinse.com/581794_image20.png",
                "author_id": 94556,
                "author_level": 5,
                "read_number": 33453,
                "read_number_yuan": "3.3万",
                "thumbnail_pic": "https://img.jinse.com/995994_image1.png",
                "thumbnails_pics": [
                    "https://img.jinse.com/995994"
                ],
                "thumbnail_type": 1,
                "source": "新浪财经",
                "topic_url": "https://m.jinse.com/blockchain/219934.html",
                "attribute_exclusive": "",
                "attribute_depth": "",
                "attribute_spread": ""
            }
        }
    ]
}

接口返回信息

第二步：通过定时任务开启爬虫工作

@Slf4j
@Component
public class SchedulePressTrigger {

    @Autowired
    private CrawlerJinSeLivePressService crawlerJinSeLivePressService;

    /**
    * 定时抓取金色财经的新闻
    */
    @Scheduled(initialDelay = 1000, fixedRate = 600 * 1000)
    public void doCrawlJinSeLivePress() {

      //  log.info("开始抓取金色财经新闻, time:" + new Date());
        try {
            crawlerJinSeLivePressService.start();
        } catch (Exception e) {
          //  log.error("本次抓取金色财经新闻异常", e);
        }
      //  log.info("结束抓取金色财经新闻, time:" + new Date());
    }
}

第三步：主要实现类

/**
 * 抓取金色财经快讯
 * @author xub
 * @since 2018/6/29
 */
@Slf4j
@Service
public class CrawlerJinSeLivePressServiceImpl extends AbstractCrawlLivePressService implements
        CrawlerJinSeLivePressService {

    //这个参数代表每一次请求获得多少个数据
    private static final int PAGE_SIZE = 15;

    //这个是真正翻页参数，每一次找id比它小的15个数据（有写接口是通过page=1，2来进行翻页所以比较好理解一点，其实它们性质一样）
    private long bottomId;


    //这个这里没有用到，但是如果有数据层，就需要用到，这里我只是把它答应到控制台
    @Autowired
    private LivePressService livePressService;
    
    
    
    //定时任务运行这个方法，doTask没有被重写，所有运行父类的方法
    @Override
    public void start() {
        try {
            doTask(CoinPressConsts.CHAIN_FOR_LIVE_PRESS_DATA_URL_FORMAT);
        } catch (IOException e) {
          //  log.error("抓取金色财经新闻异常", e);
        }
    }


    @Override
    protected List crawlPage(int pageNum) throws IOException {
        // 最多抓取100页，多抓取也没有特别大的意思。
        if (pageNum >= 100) {
            return Collections.emptyList();
        }
        // 格式化翻页参数（第一次bottomId为0，第二次就是这次爬到的最小bottomId值）
        String requestUrl = String.format(CoinPressConsts.CHAIN_FOR_LIVE_PRESS_DATA_URL_FORMAT, PAGE_SIZE, bottomId);

        Response response = OkHttp.singleton().newCall(
                new Request.Builder().url(requestUrl).addHeader("referer", CoinPressConsts.CHAIN_FOR_LIVE_URL).get().build())
                .execute();
        if (response.isRedirect()) {
            // 如果请求发生了跳转，说明请求不是原来的地址了，返回空数据。
            return Collections.emptyList();
        }

        //先获得json数据格式
        String respOnseText= response.body().string();

        //在通过工具类进行数据赋值
        JinSePressResult jinSepressResult = JsonUtils.objectFromJson(responseText, JinSePressResult.class);
        if (null == jinSepressResult) {
            // 反序列化失败
            System.out.println("抓取金色财经新闻列表反序列化异常");
            return Collections.emptyList();
        }
        // 取金色财经最小的记录id，来进行翻页
        bottomId = jinSepressResult.getBottomId();

        //这个是谷歌提供了guava包里的工具类，Lists这个集合工具，对list集合操作做了些优化提升。
        List pageListPresss = Lists.newArrayListWithExpectedSize(PAGE_SIZE);

        for (JinSePressResult.DayData dayData : jinSepressResult.getList()) {
            JinSePressData data = dayData.getExtra();
            //新闻发布时间（时间戳格式）这里可以来判断只爬多久时间以内的新闻
            long   createTime = data.getPublishedAt() * 1000;
            Long timemill=System.currentTimeMillis();
//           if (System.currentTimeMillis() - createTime > CoinPressConsts.MAX_CRAWLER_TIME) {
//               // 快讯过老了，放弃
//               continue;
//           }
            SimpleDateFormat sdf=new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
            String sd = sdf.format(new Date(createTime));   // 时间戳转换成时间
            Date newsCreateTime=new Date();
            try {
                //获得新闻发布时间
                newsCreateTime = sdf.parse(sd);
            } catch (ParseException e) {
                e.printStackTrace();
            }
            //具体文章页面路径（这里可以通过这个路径+jsoup就可以爬新闻正文所有信息了）
            String href = data.getTopicUrl();
            //新闻摘要
            String summary = data.getSummary();
            //新闻阅读数量
            String pres-s-readcount = data.getReadNumber();
            //新闻标题
            String title = dayData.getTitle();

                pageListPresss.add(new PageListPress(href,title, Integer.parseInt(pres-s-readcount),
                        newsCreateTime ,  summary));
        }
        return pageListPresss;
    }
}

AbstractCrawlLivePressService 类

 /**
  * Template for page-by-page news crawlers: {@link #doTask(String)} drives the paging loop
  * and subclasses supply the fetching of a single page through {@link #crawlPage(int)}.
  */
 public abstract class AbstractCrawlLivePressService {

     /** URL passed to the most recent {@link #doTask(String)} call. */
     String url;

     /**
      * Crawls page after page, starting at page 1, until a page yields no items.
      *
      * <p>The loop has no other exit condition, so {@link #crawlPage(int)} must eventually
      * return an empty (or null) list. Results are not stored; every item is printed to
      * standard output.
      *
      * @param url the URL (or URL format) this run is associated with; stored in {@code url}
      * @throws IOException if fetching a page fails
      */
     public void doTask(String url) throws IOException {
         this.url = url;
         int pageNum = 1;

         while (true) {
             List<PageListPress> newsList = crawlPage(pageNum++);
             // Nothing new was fetched: this run is finished.
             if (CollectionUtils.isEmpty(newsList)) {
                 break;
             }
             // Print the page from its last element to its first.
             for (int i = newsList.size() - 1; i >= 0; i--) {
                 PageListPress pageListNews = newsList.get(i);
                 System.out.println(pageListNews);
             }
         }
     }

     /**
      * Fetches one page of news; provided by the concrete crawler.
      *
      * @param pageNum 1-based page counter maintained by {@link #doTask(String)}
      * @return the items of the page; an empty or null list ends the crawl
      * @throws IOException if the page cannot be fetched
      */
     protected abstract List<PageListPress> crawlPage(int pageNum) throws IOException;

     /** One news entry as collected from a list page. */
     @Data
     @AllArgsConstructor
     @NoArgsConstructor
     public static class PageListPress {

         /** URL of the article detail page. */
         private String href;
         /** Headline of the article. */
         private String title;
         /** Number of times the article has been read. */
         private int readCounts;
         /** Publication time of the article. */
         private Date createTime;
         /** Abstract of the article. */
         private String summary;
     }
 }

JinSePressResult

/**
 * Mapping of the Jinse Finance list response.
 *
 * <p>Analyse the JSON structure before modelling it. The first level holds plain
 * properties plus a {@code list} array; each element of that array holds plain properties
 * plus an {@code extra} object. Properties that are not mapped here are ignored.
 */
@JsonIgnoreProperties(ignoreUnknown = true)
@Data
public class JinSePressResult {

    private int news;
    private int count;
    @JsonProperty("top_id")
    private long topId;
    @JsonProperty("bottom_id")
    private long bottomId;
    /**
     * Entries of the page. The field name must match the JSON property {@code list}.
     * The element type must be declared so that the entries are bound to {@link DayData};
     * with a raw {@code List} they would be bound as generic maps instead.
     */
    private List<DayData> list;

    /** One element of the {@code list} array. */
    @Data
    @JsonIgnoreProperties(ignoreUnknown = true)
    public static class DayData {

        private String title;
        /** The field name must match the JSON property {@code extra}. */
        private JinSePressData extra;
        // NOTE(review): in the sample response topic_url appears inside "extra", not on the
        // list element itself, so this field is presumably left null - confirm against the API.
        @JsonProperty("topic_url")
        private String topicUrl;
    }
}