使用JSoup+CSSPath采集和讯网人物信息

代码见github

模型类:

/**
 * Mutable data holder describing one person: a name, a map of basic-information
 * entries, and three lists of text entries (education, jobs, important events).
 *
 * <p>All fields start out {@code null} and are only changed through their setters.
 * Getters and setters pass references through unchanged: nothing is copied and
 * nothing is validated, so callers share the collections they hand in.
 */
public class Person {
    /** The person's name. */
    private String name;

    /** Basic information entries. */
    private Map<String, String> basicInfos;

    /** Education history entries. */
    private List<String> educations;

    /** Work history entries. */
    private List<String> jobs;

    /** Important event entries. */
    private List<String> importants;

    /**
     * @return the name, or {@code null} if it has not been set
     */
    public String getName() {
        return name;
    }

    /**
     * @param name the name to store
     */
    public void setName(String name) {
        this.name = name;
    }

    /**
     * @return the stored basic-information map (not a copy), or {@code null} if unset
     */
    public Map<String, String> getBasicInfos() {
        return basicInfos;
    }

    /**
     * @param basicInfos the basic-information map to store; kept by reference
     */
    public void setBasicInfos(Map<String, String> basicInfos) {
        this.basicInfos = basicInfos;
    }

    /**
     * @return the stored education list (not a copy), or {@code null} if unset
     */
    public List<String> getEducations() {
        return educations;
    }

    /**
     * @param educations the education list to store; kept by reference
     */
    public void setEducations(List<String> educations) {
        this.educations = educations;
    }

    /**
     * @return the stored job list (not a copy), or {@code null} if unset
     */
    public List<String> getJobs() {
        return jobs;
    }

    /**
     * @param jobs the job list to store; kept by reference
     */
    public void setJobs(List<String> jobs) {
        this.jobs = jobs;
    }

    /**
     * @return the stored important-event list (not a copy), or {@code null} if unset
     */
    public List<String> getImportants() {
        return importants;
    }

    /**
     * @param importants the important-event list to store; kept by reference
     */
    public void setImportants(List<String> importants) {
        this.importants = importants;
    }
}

采集器:

package org.apdplat.demo.collect;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
public class PersonCollector{
private static final Logger LOG = LoggerFactory.getLogger(PersonCollector.class);
private static final int PAGES = 298;
public List<Person> collect() {
List<Person> persons = new ArrayList<>();
try {
String url = "http://renwu.hexun.com/search.aspx?z=All&Filter=All&page=";
//共298页
for(int i=1; i<PAGES+1; i++){
url += i;
Document document = Jsoup.connect(url).get();
String cssQuery = "html body div.wrap div.mainBox div.main div.contBox div.cont div.slistBox ul li a";
LOG.debug("cssQuery: " + cssQuery);
Elements elements = document.select(cssQuery);
for(Element element : elements){
try{
String personName = element.text().replace(Jsoup.parse(" ").text(), " ").replace(Jsoup.parse("?").text(), "·");
LOG.debug("人物姓名:"+personName);
String href = element.attr("href");
LOG.debug("人物链接:"+href);
document = Jsoup.connect(href).get();
//基本信息
String basicInfoCSSQuery = "html body div.wrap div.mainBox div.main div.setBase div.right ul li";
LOG.debug("basicInfoCSSQuery: " + basicInfoCSSQuery);
Elements basicElements = document.select(basicInfoCSSQuery);
Map<String, String> basicInfos = new HashMap<>();
for(Element basicElement : basicElements){
String info = basicElement.text().replace(Jsoup.parse(" ").text(), " ").replace(Jsoup.parse("?").text(), "·");
if(info != null){
String[] attrs = info.split(":");
