The crawler code was adapted from code found online; its original database operations have been replaced with Elasticsearch (ES) data access.
Entity classes:
Crawl record:
@Data
public class Record {
public static final String TYPE_MOVIE = "Movie";
public static final String TYPE_COMMENT = "Comment";
public static final String TYPE_OTHER = "Other";
public static final int STATUS_UNCRAWLED = 0;
public static final int STATUS_CRAWLED = 1;
public static final int STATUS_ERROR = 2;
private String id;
private String url;
private Integer crawled;
private String type;
}
Movie:
@Data
public class Movie {
private Integer id;
private String name;
private String director;
private String scenarist;
private String actors;
private String type;
private String country;
private String language;
private String releaseDate;
private String runtime;
private String ratingNum;
private String tags;
private String recordId;
private String url;
private String subjectId;
private String summary;
}
Comment:
@Data
public class Comment {
public static final Integer ContentLength = 2000;
private Integer id;
private String content;
private String author;
private String authorImg;
private Integer vote;
private String movie;
private Integer movieId;
private String subjectId;
private String recordId;
}
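All three entities are stored in ES as JSON documents. The EsDao shown further down converts them through a JacksonUtil helper that the post does not include; a minimal sketch of what it presumably looks like (the method bodies here are an assumption, not the original implementation):

import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import java.io.IOException;

// Sketch of the JacksonUtil helper referenced by EsDao; the real class is not shown in the post.
public class JacksonUtil {
    private static final ObjectMapper MAPPER = new ObjectMapper();

    // Serialize any entity to a JSON string for indexing
    public static String toJson(Object obj) {
        try {
            return MAPPER.writeValueAsString(obj);
        } catch (JsonProcessingException e) {
            throw new RuntimeException(e);
        }
    }

    // Deserialize a search hit's _source back into an entity
    public static <T> T jsonToBean(String json, Class<T> clazz) {
        try {
            return MAPPER.readValue(json, clazz);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }
}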
Crawler service class:
@Service
public class DoubanService {
public static final String MAIN_URL = "https://movie.douban.com";
public static final String BASE_URL = "https://movie.douban.com/subject";
public static final Integer MAX_COUNT = 10000;
public static final Integer DEFAULT_DATA_LENGTH = 255;
@Autowired
private EsDao esDao;
private Logger LOG = LoggerFactory.getLogger("CommonLog");
public void crawl() {
//Each single-page crawl was meant to be one transaction, so the loop sits outside the per-page method
for (int i = 0; i < MAX_COUNT; i++) {
try {
sleepAwhile();
crawlOnePage();
} catch (Exception e) {
e.printStackTrace();
}
}
}
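// sleepAwhile() is called in crawl() but its body is not part of the original listing;
// a plausible throttling sketch (the 1-3 second random delay is an assumption):
private void sleepAwhile() throws InterruptedException {
Thread.sleep(java.util.concurrent.ThreadLocalRandom.current().nextLong(1000, 3000));
}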
public void crawlOnePage() throws Exception {
Record record = getOneRecordToCrawl();
String url = record.getUrl();
LOG.info("Crawling url:" + url);
try {
Document doc = Jsoup.connect(url).get();
//1. collect the useful subject hrefs on this page and store them as Records
crawlValuableRecordInPage(doc);
//2. extract the movie info or the comments from this page
if (Record.TYPE_MOVIE.equals(record.getType())) {
crawlMovieInfo(doc, record);
} else if (Record.TYPE_COMMENT.equals(record.getType())) {
crawlCommentInfo(doc, record);
}
record.setCrawled(Record.STATUS_CRAWLED);
} catch (HttpStatusException e) {
record.setCrawled(Record.STATUS_ERROR);
LOG.info(e.getMessage());
} catch (Exception e) {
newTxnToSave(record);
LOG.info(e.getMessage());
throw e;
}
esDao.saveRecord(record);
}
private void newTxnToSave(Record r) {
r.setCrawled(Record.STATUS_ERROR);
esDao.saveRecord(r);
}
private Record getOneRecordToCrawl() {
Record r = esDao.getFirstRecordByStatus(Record.STATUS_UNCRAWLED);
if (r == null) {
r = new Record();
r.setUrl(MAIN_URL);
r.setCrawled(Record.STATUS_UNCRAWLED);
r.setType(Record.TYPE_OTHER);
// esDao.saveRecord(r);
}
return r;
}
private void crawlValuableRecordInPage(Document doc) {
Elements hrefs = doc.select("a[href^='" + BASE_URL + "']");
for (Element e : hrefs) {
String href = e.attr("href").trim();
String type = CommonUtil.whichType(href);
if (type == null || Record.TYPE_OTHER.equals(type) || esDao.getRecordByUrl(href) != null) {
continue;
}
Record record = new Record();
record.setUrl(href);
record.setCrawled(Record.STATUS_UNCRAWLED);
record.setType(type);
esDao.saveRecord(record);
}
}
private void crawlMovieInfo(Document doc, Record record) {
Element infoDiv = doc.getElementById("info");
if (infoDiv == null) {
return;
}
String subjectId = CommonUtil.extractSubjectId(record.getUrl());
if (esDao.findMovieBySubjectId(subjectId) != null) {
return;
}
Elements subInfos = infoDiv.children();
Movie movie = new Movie();
for (Element subInfo : subInfos) {
if (subInfo.childNodeSize() > 0) {
String key = subInfo.getElementsByAttributeValue("class", "pl").text();
if (key == null || "".equals(key)) {
continue;
}
if ("导演".equals(key)) {
String director = subInfo.getElementsByAttributeValue("class", "attrs").text();
movie.setDirector(CommonUtil.truncateString(director));
} else if ("编剧".equals(key)) {
movie.setScenarist(CommonUtil.truncateString(subInfo.getElementsByAttributeValue("class", "attrs").text()));
} else if ("主演".equals(key)) {
String actors = subInfo.getElementsByAttributeValue("class", "attrs").text();
movie.setActors(CommonUtil.truncateString(actors, 1000));
}
}
}
Pattern pattern = Pattern.compile("制片国家/地区:</span>(.*?)\n");
Matcher matcher = pattern.matcher(infoDiv.html());
if (matcher.find()) {
movie.setCountry(matcher.group(1).trim());
}
pattern = Pattern.compile("语言:</span>(.*?)\n");
matcher = pattern.matcher(infoDiv.html());
if (matcher.find()) {
movie.setLanguage(matcher.group(1).trim());
}
movie.setType(infoDiv.getElementsByAttributeValue("property", "v:genre").text());
movie.setReleaseDate(infoDiv.getElementsByAttributeValue("property", "v:initialReleaseDate").text());
movie.setRuntime(infoDiv.getElementsByAttributeValue("property", "v:runtime").text());
movie.setTags(doc.getElementsByClass("tags-body").text());
movie.setName(doc.getElementsByAttributeValue("property", "v:itemreviewed").text());
movie.setRatingNum(doc.getElementsByAttributeValue("property", "v:average").text());
movie.setSubjectId(subjectId);
movie.setRecordId(record.getId());
movie.setSummary(doc.getElementsByAttributeValue("property", "v:summary").text().trim());
LOG.info("Movie :《" + movie.getName() + "》 Points: " + movie.getRatingNum() + "\n" + "Summary:" + movie.getSummary());
esDao.saveMovie(movie);
}
private void crawlCommentInfo(Document doc, Record record) {
Element el = doc.getElementById("comments");
if (el != null) {
String[] movies = doc.getElementsByTag("h1").text().replace(" ", "").split("短评");
String movieName = movies[0];// the <h1> reads "<movie name> 短评", so splitting on "短评" leaves the movie name
Elements items = el.select(".comment-item");
for (Element item : items) {
if (item.getElementsByClass("fold-bd").size() < 1 && item.children().get(1).getElementsByTag("p").size() > 0) {
// make sure the current item is an actual comment item (it contains a <p> tag) rather than some other info block; the fold-bd check skips folded comments (Douban wraps a comment in fold-bd when the posting account is flagged as abnormal)
Comment comm = new Comment();
comm.setMovie(movieName);
//strip 4-byte UTF-8 characters (including emoji) from the comment content, because MySQL's utf8 charset cannot store them
//(the alternative would have been switching MySQL to utf8mb4)
String content = item.children().get(1).getElementsByTag("p").text().trim();
content = CommonUtil.delUtf8mb4Chars(CommonUtil.truncateString(content, Comment.ContentLength));
comm.setContent(content);// note: "item.children().get(1).text()" would return the whole comment block instead, e.g. "1819 有用 桃桃淘电影 2016-10-29 ..." (votes, author, date and content concatenated), not just the comment text
comm.setVote(Integer.parseInt(item.getElementsByAttributeValue("class", "votes").text()));
String author = item.getElementsByAttribute("href").get(2).text();
comm.setAuthor(CommonUtil.delUtf8mb4Chars(author));
comm.setAuthorImg(item.getElementsByAttribute("href").get(2).attr("href"));
comm.setRecordId(record.getId());
comm.setSubjectId(CommonUtil.extractSubjectId(record.getUrl()));
LOG.info("Comment for 《" + movieName + "》:" + comm.getContent());
esDao.saveComment(comm);
}
}
}
}
public void deleteAll() {
esDao.deleteAll();
}
}
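The service also depends on a CommonUtil helper (whichType, extractSubjectId, truncateString, delUtf8mb4Chars) that the post does not show. A rough sketch of what those methods might do, inferred from the call sites above (the regexes and truncation rules are assumptions, not the original implementation):

import com.luangeng.common.entity.Record;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

// Inferred sketch of CommonUtil; the original implementation is not shown in the post.
public class CommonUtil {
    // e.g. https://movie.douban.com/subject/1292052/          -> a movie page
    //      https://movie.douban.com/subject/1292052/comments  -> a comments page
    private static final Pattern MOVIE_URL = Pattern.compile("https://movie\\.douban\\.com/subject/\\d+/?$");
    private static final Pattern COMMENT_URL = Pattern.compile("https://movie\\.douban\\.com/subject/\\d+/comments.*");
    private static final Pattern SUBJECT_ID = Pattern.compile("/subject/(\\d+)");

    // Classify a href into Record.TYPE_MOVIE / TYPE_COMMENT / TYPE_OTHER
    public static String whichType(String url) {
        if (MOVIE_URL.matcher(url).matches()) {
            return Record.TYPE_MOVIE;
        }
        if (COMMENT_URL.matcher(url).matches()) {
            return Record.TYPE_COMMENT;
        }
        return Record.TYPE_OTHER;
    }

    // Pull the numeric subject id out of a Douban URL
    public static String extractSubjectId(String url) {
        Matcher m = SUBJECT_ID.matcher(url);
        return m.find() ? m.group(1) : null;
    }

    public static String truncateString(String s) {
        return truncateString(s, 255); // matches DEFAULT_DATA_LENGTH in DoubanService
    }

    public static String truncateString(String s, int maxLength) {
        if (s == null || s.length() <= maxLength) {
            return s;
        }
        return s.substring(0, maxLength);
    }

    // Drop 4-byte UTF-8 characters (code points above the BMP), e.g. emoji
    public static String delUtf8mb4Chars(String s) {
        if (s == null) {
            return null;
        }
        return s.replaceAll("[^\\u0000-\\uFFFF]", "");
    }
}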
ES data access class:
package com.luangeng.crawler;
import com.luangeng.common.Page;
import com.luangeng.common.entity.Comment;
import com.luangeng.common.entity.Movie;
import com.luangeng.common.entity.Record;
import com.luangeng.crawler.util.JacksonUtil;
import org.elasticsearch.action.index.IndexRequest;
import org.elasticsearch.action.index.IndexResponse;
import org.elasticsearch.action.search.SearchRequest;
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.client.RequestOptions;
import org.elasticsearch.client.RestHighLevelClient;
import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.common.xcontent.XContentType;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.search.SearchHit;
import org.elasticsearch.search.SearchHits;
import org.elasticsearch.search.builder.SearchSourceBuilder;
import org.elasticsearch.search.sort.FieldSortBuilder;
import org.elasticsearch.search.sort.SortOrder;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Repository;
import java.io.IOException;
import java.util.List;
import java.util.concurrent.TimeUnit;
@Repository
public class EsDao {
private static final String TYPE = "doc";
private static final String INDEX_RECORD = "douban_record";
private static final String INDEX_MOVIE = "douban_movie";
private static final String INDEX_COMMENT = "douban_comment";
@Autowired
RestHighLevelClient client;// = new RestHighLevelClient(RestClient.builder(new HttpHost("localhost", 9200, "http")));;
public void saveComment(Comment comment) {
try {
IndexRequest request = new IndexRequest(INDEX_COMMENT, TYPE);
request.source(JacksonUtil.toJson(comment), XContentType.JSON);
IndexResponse indexResponse = client.index(request, RequestOptions.DEFAULT);
} catch (IOException e) {
e.printStackTrace();
}
}
public void saveMovie(Movie movie) {
try {
IndexRequest request = new IndexRequest(INDEX_MOVIE, TYPE);
request.source(JacksonUtil.toJson(movie), XContentType.JSON);
IndexResponse indexResponse = client.index(request, RequestOptions.DEFAULT);
} catch (IOException e) {
e.printStackTrace();
}
}
public void saveRecord(Record record) {
try {
IndexRequest request;
if (record.getId() == null) {
request = new IndexRequest(INDEX_RECORD, TYPE);
} else {
request = new IndexRequest(INDEX_RECORD, TYPE, record.getId());
}
request.source(JacksonUtil.toJson(record), XContentType.JSON);
IndexResponse indexResponse = client.index(request, RequestOptions.DEFAULT);
} catch (IOException e) {
e.printStackTrace();
}
}
public Record getFirstRecordByStatus(int status) {
SearchSourceBuilder sourceBuilder = new SearchSourceBuilder();
sourceBuilder.query(QueryBuilders.termQuery("crawled", status));
sourceBuilder.from(0);
sourceBuilder.size(1);
sourceBuilder.timeout(new TimeValue(60, TimeUnit.SECONDS));
SearchRequest searchRequest = new SearchRequest(INDEX_RECORD);
searchRequest.source(sourceBuilder);
SearchResponse searchResponse = null;
try {
searchResponse = client.search(searchRequest, RequestOptions.DEFAULT);
} catch (Exception e) {
e.printStackTrace();
return null;
}
if (searchResponse == null) {
return null;
}
SearchHits hits = searchResponse.getHits();
SearchHit[] searchHits = hits.getHits();
if (searchHits.length == 0) {
return null;
}
Record r = JacksonUtil.jsonToBean(searchHits[0].getSourceAsString(), Record.class);
r.setId(searchHits[0].getId());
return r;
}
public Record getRecordByUrl(String url) {
SearchSourceBuilder sourceBuilder = new SearchSourceBuilder();
sourceBuilder.query(QueryBuilders.termQuery("url", url));
sourceBuilder.from(0);
sourceBuilder.size(1);
sourceBuilder.timeout(new TimeValue(60, TimeUnit.SECONDS));
SearchRequest searchRequest = new SearchRequest(INDEX_RECORD);
searchRequest.source(sourceBuilder);
SearchResponse searchResponse = null;
try {
searchResponse = client.search(searchRequest, RequestOptions.DEFAULT);
} catch (Exception e) {
e.printStackTrace();
return null;
}
SearchHits hits = searchResponse.getHits();
SearchHit[] searchHits = hits.getHits();
if (searchHits.length == 0) {
return null;
}
return JacksonUtil.jsonToBean(searchHits[0].getSourceAsString(), Record.class);
}
public Movie findMovieBySubjectId(String id) {
SearchSourceBuilder sourceBuilder = new SearchSourceBuilder();
sourceBuilder.query(QueryBuilders.termQuery("subjectId", id));
sourceBuilder.from(0);
sourceBuilder.size(1);
sourceBuilder.timeout(new TimeValue(60, TimeUnit.SECONDS));
SearchRequest searchRequest = new SearchRequest(INDEX_MOVIE);
searchRequest.source(sourceBuilder);
SearchResponse searchResponse = null;
try {
searchResponse = client.search(searchRequest, RequestOptions.DEFAULT);
} catch (Exception e) {
e.printStackTrace();
return null;
}
SearchHits hits = searchResponse.getHits();
SearchHit[] searchHits = hits.getHits();
if (searchHits.length == 0) {
return null;
}
return JacksonUtil.jsonToBean(searchHits[0].getSourceAsString(), Movie.class);
}
public void deleteAll() {
}
public SearchHit[] queryRecord(Record record, Page page){
SearchSourceBuilder sourceBuilder = new SearchSourceBuilder();
if(record.getCrawled()!=null) {
sourceBuilder.query(QueryBuilders.termQuery("crawled", record.getCrawled()));
}
sourceBuilder.from((page.getPage()-1)*page.getSize());
sourceBuilder.size(page.getSize());
sourceBuilder.timeout(new TimeValue(60, TimeUnit.SECONDS));
sourceBuilder.sort(new FieldSortBuilder("_id").order(SortOrder.ASC));
SearchRequest searchRequest = new SearchRequest(INDEX_RECORD);
searchRequest.source(sourceBuilder);
SearchResponse searchResponse = null;
try {
searchResponse = client.search(searchRequest, RequestOptions.DEFAULT);
} catch (Exception e) {
e.printStackTrace();
return null;
}
SearchHits hits = searchResponse.getHits();
SearchHit[] searchHits = hits.getHits();
return searchHits;
}
}
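EsDao autowires a RestHighLevelClient, but the post does not show how that bean is created. Based on the commented-out constructor call in the field declaration, a minimal Spring configuration might look like this (host and port are assumptions):

import org.apache.http.HttpHost;
import org.elasticsearch.client.RestClient;
import org.elasticsearch.client.RestHighLevelClient;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;

// Sketch of the client configuration; the address mirrors the commented-out
// construction in EsDao and is an assumption, not the original config.
@Configuration
public class EsClientConfig {

    @Bean(destroyMethod = "close")
    public RestHighLevelClient restHighLevelClient() {
        return new RestHighLevelClient(
                RestClient.builder(new HttpHost("localhost", 9200, "http")));
    }
}

One thing to watch: getRecordByUrl and findMovieBySubjectId rely on exact-match term queries, which only behave as intended if url and subjectId are mapped as keyword fields (or queried through the default .keyword sub-field).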
Document counts after 3 hours of crawling:
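The per-index counts can also be read back with the same client, e.g. a match_all search with size 0. A small sketch of a method that could sit on EsDao, reusing its client field and existing imports (and assuming the 6.x high-level client, where getTotalHits() returns a long):

// Count the documents in one index; a sketch, not part of the original post.
public long countDocs(String index) throws IOException {
    SearchSourceBuilder source = new SearchSourceBuilder()
            .query(QueryBuilders.matchAllQuery())
            .size(0); // no hits needed, only the total
    SearchRequest request = new SearchRequest(index).source(source);
    SearchResponse response = client.search(request, RequestOptions.DEFAULT);
    return response.getHits().getTotalHits();
}

// e.g. countDocs("douban_record"), countDocs("douban_movie"), countDocs("douban_comment")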
- Author: luangeng
- Homepage: https://wawazhua.cn
- Original post: https://wawazhua.cn/post/java/other/elasticsearch-douban/
- License: no reposting, non-commercial, no derivatives