这几天在学习用 Java 解析 XML,突然想到 DOM 能不能用来解析 HTML,结果试了半天行不通。于是去查了一些资料,发现很多人都在用 Jsoup 解析 HTML 文件。研究了一下之后,我写了一个简单的实例,感觉还有很多地方需要润色,在这里分享出来,欢迎交流指教!后续想通过 Java 把数据导入到 Excel,或者生成一个报表。
import java.io.IOException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;/**从智联招聘获取招聘信息* &#64;url 智联招聘网站链接&#xff08;建议不要更改&#xff09;* &#64;city 搜索工作的城市* &#64;keywrods 搜索工作的相关关键字*/public class JsoupHtml {private String url&#61;"http://sou.zhaopin.com/jobs/searchresult.ashx?jl&#61;"; //智联招聘网站private String city&#61;"西安"; //搜索工作的城市private String keywords&#61;"java"; //搜索工作的关键字public JsoupHtml(String city,String keywords){ this.city&#61;city;this.keywords &#61;keywords;}public void getZhiLianWork(){try {for (int i&#61;0;i<10;i&#43;&#43;) {System.out.println("*********开始遍历第"&#43;(i&#43;1)&#43;"页的求职信息*********");Document doc &#61; Jsoup.connect(url&#43;city&#43;"&kw&#61;"&#43;keywords&#43;"&p&#61;"&#43;(i&#43;1)&#43;"&isadv&#61;0").get(); Element content &#61; doc.getElementById("newlist_list_content_table"); Elements zwmcEls &#61; content.getElementsByClass("zwmc");Elements gsmcEls &#61; content.getElementsByClass("gsmc"); Elements zwyxEls &#61; content.getElementsByClass("zwyx"); Elements gzddEls &#61; content.getElementsByClass("gzdd"); Elements gxsjEls &#61; content.getElementsByClass("gxsj");for(int j &#61; 0;j 更新源代码&#xff0c;支持生成html表格&#xff1a;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;public class JsoupHtml {public static void main(String[] args) { try {String url &#61;"http://sou.zhaopin.com/jobs/searchresult.ashx?";String city &#61;"西安";String keywords &#61; "java";BufferedWriter bWriter &#61; new BufferedWriter(new OutputStreamWriter(new FileOutputStream("output.html"),"utf-8"));bWriter.write("");File input &#61; new File("input.html");Document doc2 &#61; Jsoup.parse(input, "UTF-8", "");Element table &#61; doc2.getElementById("workinfo");table.text("");Element theader &#61; table.appendElement("tr");theader.appendElement("th").text("序号");theader.appendElement("th").text("职位名称");theader.appendElement("th").text("公司名称");theader.appendElement("th").text("职位月薪");theader.appendElement("th").text("工作地点");theader.appendElement("th").text("发布日期"); for(int page&#61;0;page<10;page&#43;&#43;){ Document doc &#61; Jsoup.connect(url&#43;city&#43;"&kw&#61;"&#43;keywords&#43;"&p&#61;"&#43;page).get(); Element content &#61; doc.getElementById("newlist_list_content_table"); Elements zwmcEls &#61; content.getElementsByClass("zwmc");Elements gsmcEls &#61; content.getElementsByClass("gsmc"); Elements zwyxEls &#61; content.getElementsByClass("zwyx"); Elements gzddEls &#61; content.getElementsByClass("gzdd"); Elements gxsjEls &#61; content.getElementsByClass("gxsj");for(int i &#61; 1;i output.html模板&#xff1a;
智联工作信息
版权所有 翻版必究@2018 Joker