• Java

爬取电影信息使用Elasticsearch检索

爬虫代码源于网络,这里将其中的数据库操作改为通过 Elasticsearch(ES)读写数据。

实体类:

爬取记录:

/**
 * A crawl-queue entry: one URL together with its crawl status and page type.
 *
 * <p>Accessors are generated by {@code @Data} (presumably Lombok — the import is
 * not shown in this listing).
 */
@Data
public class Record {
    // Page types stored in {@link #type}.
    public static final String TYPE_MOVIE = "Movie";
    public static final String TYPE_COMMENT = "Comment";
    public static final String TYPE_OTHER = "Other";

    // Crawl states stored in {@link #crawled}.
    public static final int STATUS_UNCRAWLED = 0;
    public static final int STATUS_CRAWLED = 1;
    public static final int STATUS_ERROR = 2;

    // Document id; null for a record that has not been stored yet.
    private String id;

    // Address of the page to crawl.
    private String url;
    // One of the STATUS_* constants.
    private Integer crawled;
    // One of the TYPE_* constants.
    private String type;

}

电影:

/**
 * Movie details scraped from a movie page.
 *
 * <p>All scraped values are kept as plain strings. Accessors are generated by
 * {@code @Data} (presumably Lombok — the import is not shown in this listing).
 */
@Data
public class Movie {
    private Integer id;

    private String name;
    private String director;
    // Screenwriter(s).
    private String scenarist;

    private String actors;

    // Genre text.
    private String type;
    private String country;
    private String language;

    private String releaseDate;

    private String runtime;
    // Average rating, kept as the scraped text.
    private String ratingNum;
    private String tags;
    // Id of the crawl Record this movie was scraped from.
    private String recordId;
    private String url;
    // Subject id extracted from the page URL.
    private String subjectId;

    private String summary;

}

评论:

/**
 * A short comment scraped from a movie's comment page.
 *
 * <p>Accessors are generated by {@code @Data} (presumably Lombok — the import is
 * not shown in this listing).
 */
@Data
public class Comment {
    // Maximum number of characters of comment text kept by the crawler.
    public static final Integer ContentLength = 2000;

    private Integer id;

    private String content;
    private String author;
    // NOTE(review): the crawler stores the author link's href here, not an image URL — confirm intent.
    private String authorImg;
    // Vote count shown next to the comment.
    private Integer vote;
    // Name of the movie the comment belongs to.
    private String movie;
    private Integer movieId;
    // Subject id extracted from the page URL.
    private String subjectId;
    // Id of the crawl Record this comment was scraped from.
    private String recordId;

}

爬虫服务类:

/**
 * Crawls Douban movie pages and stores the results through {@link EsDao}.
 *
 * <p>Each step takes one un-crawled {@link Record}, downloads its page, queues
 * the subject links found on it as new records, extracts a {@link Movie} or the
 * page's {@link Comment}s depending on the record type, and finally writes the
 * record back with its new status.
 */
@Service
public class DoubanService {

    public static final String MAIN_URL = "https://movie.douban.com";
    public static final String BASE_URL = "https://movie.douban.com/subject";
    public static final Integer MAX_COUNT = 10000;
    public static final Integer DEFAULT_DATA_LENGTH = 255;

    private static final Logger LOG = LoggerFactory.getLogger("CommonLog");

    // Compiled once; both are matched against the raw HTML of the #info block.
    private static final Pattern COUNTRY_PATTERN = Pattern.compile("制片国家/地区:</span>(.*?)\n");
    private static final Pattern LANGUAGE_PATTERN = Pattern.compile("语言:</span>(.*?)\n");

    @Autowired
    private EsDao esDao;

    /**
     * Runs up to {@link #MAX_COUNT} crawl steps. A failing step is logged and the
     * loop moves on; an interrupt stops the crawl.
     */
    public void crawl() {
        for (int i = 0; i < MAX_COUNT; i++) {
            try {
                // NOTE(review): sleepAwhile() is not defined in this listing; presumably a
                // throttling pause between requests — confirm it exists on the real class.
                sleepAwhile();
                crawlOnePage();
            } catch (Exception e) {
                if (e instanceof InterruptedException) {
                    // Restore the flag and stop instead of spinning through the remaining iterations.
                    Thread.currentThread().interrupt();
                    LOG.warn("Crawl interrupted after {} page(s); stopping", i);
                    return;
                }
                LOG.error("Crawl iteration " + i + " failed", e);
            }
        }
    }

    /**
     * Crawls a single page and persists the outcome on its record.
     *
     * <p>An HTTP status error marks the record as {@link Record#STATUS_ERROR} and
     * returns normally. Any other failure also marks the record as failed, then
     * rethrows.
     *
     * @throws Exception any non-HTTP-status failure raised while crawling or storing
     */
    public void crawlOnePage() throws Exception {
        Record record = getOneRecordToCrawl();
        String url = record.getUrl();
        LOG.info("Crawling url:" + url);
        try {
            Document doc = Jsoup.connect(url).get();

            // 1. Queue the useful links found on this page as new records.
            crawlValuableRecordInPage(doc);

            // 2. Extract the movie or the comments shown on this page.
            if (Record.TYPE_MOVIE.equals(record.getType())) {
                crawlMovieInfo(doc, record);
            } else if (Record.TYPE_COMMENT.equals(record.getType())) {
                crawlCommentInfo(doc, record);
            }
            record.setCrawled(Record.STATUS_CRAWLED);

        } catch (HttpStatusException e) {
            record.setCrawled(Record.STATUS_ERROR);
            LOG.warn("HTTP error while crawling " + url, e);
        } catch (Exception e) {
            try {
                markFailedAndSave(record);
            } catch (RuntimeException saveFailure) {
                // Keep the original failure as the one callers see.
                e.addSuppressed(saveFailure);
            }
            LOG.warn("Failed to crawl " + url + ": " + e.getMessage());
            throw e;
        }
        esDao.saveRecord(record);
    }

    /** Marks the record as failed and stores it immediately. */
    private void markFailedAndSave(Record r) {
        r.setCrawled(Record.STATUS_ERROR);
        esDao.saveRecord(r);
    }

    /**
     * Returns the next un-crawled record, or an unsaved seed record pointing at
     * {@link #MAIN_URL} when the DAO returns none.
     */
    private Record getOneRecordToCrawl() {
        Record r = esDao.getFirstRecordByStatus(Record.STATUS_UNCRAWLED);
        if (r == null) {
            r = new Record();
            r.setUrl(MAIN_URL);
            r.setCrawled(Record.STATUS_UNCRAWLED);
            r.setType(Record.TYPE_OTHER);
        }
        return r;
    }

    /**
     * Stores every not-yet-known movie/comment link on the page as an un-crawled record.
     */
    private void crawlValuableRecordInPage(Document doc) {
        Elements hrefs = doc.select("a[href^='" + BASE_URL + "']");
        // A record indexed a moment ago may not be visible to a search yet, so the
        // same link repeated on one page is also de-duplicated locally.
        java.util.Set<String> seenInPage = new java.util.HashSet<>();
        for (Element e : hrefs) {
            String href = e.attr("href").trim();
            String type = CommonUtil.whichType(href);
            if (type == null || Record.TYPE_OTHER.equals(type)) {
                continue;
            }
            if (!seenInPage.add(href) || esDao.getRecordByUrl(href) != null) {
                continue;
            }
            Record record = new Record();
            record.setUrl(href);
            record.setCrawled(Record.STATUS_UNCRAWLED);
            record.setType(type);
            esDao.saveRecord(record);
        }
    }

    /**
     * Extracts the movie described on the page and stores it, unless the page has
     * no {@code #info} block or a movie with the same subject id already exists.
     */
    private void crawlMovieInfo(Document doc, Record record) {
        Element infoDiv = doc.getElementById("info");
        if (infoDiv == null) {
            return;
        }
        String subjectId = CommonUtil.extractSubjectId(record.getUrl());
        if (esDao.findMovieBySubjectId(subjectId) != null) {
            return;
        }

        Movie movie = new Movie();
        for (Element subInfo : infoDiv.children()) {
            if (subInfo.childNodeSize() == 0) {
                continue;
            }
            // The label span (class "pl") tells which attribute this row holds.
            String key = subInfo.getElementsByAttributeValue("class", "pl").text();
            if (key == null || key.isEmpty()) {
                continue;
            }
            switch (key) {
                case "导演":
                    movie.setDirector(CommonUtil.truncateString(attrsText(subInfo)));
                    break;
                case "编剧":
                    movie.setScenarist(CommonUtil.truncateString(attrsText(subInfo)));
                    break;
                case "主演":
                    movie.setActors(CommonUtil.truncateString(attrsText(subInfo), 1000));
                    break;
                default:
                    break;
            }
        }

        // Country and language are bare text after their label, so they are taken from the raw HTML.
        String infoHtml = infoDiv.html();
        Matcher matcher = COUNTRY_PATTERN.matcher(infoHtml);
        if (matcher.find()) {
            movie.setCountry(matcher.group(1).trim());
        }
        matcher = LANGUAGE_PATTERN.matcher(infoHtml);
        if (matcher.find()) {
            movie.setLanguage(matcher.group(1).trim());
        }
        movie.setType(infoDiv.getElementsByAttributeValue("property", "v:genre").text());
        movie.setReleaseDate(infoDiv.getElementsByAttributeValue("property", "v:initialReleaseDate").text());
        movie.setRuntime(infoDiv.getElementsByAttributeValue("property", "v:runtime").text());
        movie.setTags(doc.getElementsByClass("tags-body").text());
        movie.setName(doc.getElementsByAttributeValue("property", "v:itemreviewed").text());
        movie.setRatingNum(doc.getElementsByAttributeValue("property", "v:average").text());
        movie.setSubjectId(subjectId);
        movie.setRecordId(record.getId());
        movie.setSummary(doc.getElementsByAttributeValue("property", "v:summary").text().trim());

        LOG.info("Movie :《" + movie.getName() + "》 Points: " + movie.getRatingNum() + "\n" + "Summary:" + movie.getSummary());
        esDao.saveMovie(movie);
    }

    /** Returns the text of the value span (class "attrs") of one info row. */
    private static String attrsText(Element subInfo) {
        return subInfo.getElementsByAttributeValue("class", "attrs").text();
    }

    /**
     * Extracts and stores every visible comment on a comment page. Items that do
     * not have the expected structure are skipped instead of failing the page.
     */
    private void crawlCommentInfo(Document doc, Record record) {
        Element el = doc.getElementById("comments");
        if (el == null) {
            return;
        }
        // The heading is "<movie name>短评"; split() returns an empty array when the
        // heading is exactly the delimiter.
        String[] titleParts = doc.getElementsByTag("h1").text().replace(" ", "").split("短评");
        String movieName = titleParts.length > 0 ? titleParts[0] : "";

        for (Element item : el.select(".comment-item")) {
            // "fold-bd" marks a folded comment (the original author notes these belong to
            // accounts flagged as abnormal); an item without a second child holding a <p>
            // is some other info item, not a comment.
            if (!item.getElementsByClass("fold-bd").isEmpty() || item.children().size() < 2) {
                continue;
            }
            Elements paragraphs = item.children().get(1).getElementsByTag("p");
            if (paragraphs.isEmpty()) {
                continue;
            }
            Elements links = item.getElementsByAttribute("href");
            if (links.size() < 3) {
                LOG.warn("Skipping comment item without an author link on {}", record.getUrl());
                continue;
            }

            Comment comm = new Comment();
            comm.setMovie(movieName);
            // 4-byte UTF-8 characters (emoji included) are stripped. The original code did this
            // because MySQL's utf8 charset cannot store them; it is kept unchanged here.
            String content = paragraphs.text().trim();
            content = CommonUtil.delUtf8mb4Chars(CommonUtil.truncateString(content, Comment.ContentLength));
            // Only the <p> text is stored; the second child's full text would also include
            // the vote count, author name and date.
            comm.setContent(content);

            comm.setVote(parseVotes(item.getElementsByAttributeValue("class", "votes").text()));
            Element authorLink = links.get(2);
            comm.setAuthor(CommonUtil.delUtf8mb4Chars(authorLink.text()));
            comm.setAuthorImg(authorLink.attr("href"));
            comm.setRecordId(record.getId());
            comm.setSubjectId(CommonUtil.extractSubjectId(record.getUrl()));

            LOG.info("Comment for 《" + movieName + "》:" + comm.getContent());
            esDao.saveComment(comm);
        }
    }

    /**
     * Parses the vote count, returning 0 when the text is empty or not a number so
     * that one malformed item does not abort the whole page.
     */
    private static Integer parseVotes(String text) {
        try {
            return Integer.valueOf(text.trim());
        } catch (NumberFormatException ignored) {
            // Deliberately tolerated: missing or non-numeric vote text counts as zero votes.
            return 0;
        }
    }

    /** Delegates to {@link EsDao#deleteAll()}. */
    public void deleteAll() {
        esDao.deleteAll();
    }
}

访问Es数据类:

package com.luangeng.crawler;

import com.luangeng.common.Page;
import com.luangeng.common.entity.Comment;
import com.luangeng.common.entity.Movie;
import com.luangeng.common.entity.Record;
import com.luangeng.crawler.util.JacksonUtil;
import org.elasticsearch.action.index.IndexRequest;
import org.elasticsearch.action.index.IndexResponse;
import org.elasticsearch.action.search.SearchRequest;
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.client.RequestOptions;
import org.elasticsearch.client.RestHighLevelClient;
import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.common.xcontent.XContentType;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.search.SearchHit;
import org.elasticsearch.search.SearchHits;
import org.elasticsearch.search.builder.SearchSourceBuilder;
import org.elasticsearch.search.sort.FieldSortBuilder;
import org.elasticsearch.search.sort.SortOrder;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Repository;

import java.io.IOException;
import java.io.UncheckedIOException;
import java.util.List;
import java.util.concurrent.TimeUnit;

/**
 * Elasticsearch access for crawl records, movies and comments.
 *
 * <p>Writes fail loudly: an I/O failure while indexing is rethrown as
 * {@link java.io.UncheckedIOException} so callers do not treat a lost write as a
 * success. Lookups return {@code null} on any search failure, as before.
 */
@Repository
public class EsDao {

    private static final String TYPE = "doc";

    private static final String INDEX_RECORD = "douban_record";

    private static final String INDEX_MOVIE = "douban_movie";

    private static final String INDEX_COMMENT = "douban_comment";

    private static final TimeValue SEARCH_TIMEOUT = new TimeValue(60, TimeUnit.SECONDS);

    @Autowired
    RestHighLevelClient client;

    /**
     * Indexes a comment as a new document.
     *
     * @throws java.io.UncheckedIOException if the request fails with an I/O error
     */
    public void saveComment(Comment comment) {
        index(new IndexRequest(INDEX_COMMENT, TYPE), JacksonUtil.toJson(comment));
    }

    /**
     * Indexes a movie as a new document.
     *
     * @throws java.io.UncheckedIOException if the request fails with an I/O error
     */
    public void saveMovie(Movie movie) {
        index(new IndexRequest(INDEX_MOVIE, TYPE), JacksonUtil.toJson(movie));
    }

    /**
     * Indexes a record. A record that carries an id is written under that id; one
     * without an id is written with a generated id.
     *
     * @throws java.io.UncheckedIOException if the request fails with an I/O error
     */
    public void saveRecord(Record record) {
        IndexRequest request = record.getId() == null
                ? new IndexRequest(INDEX_RECORD, TYPE)
                : new IndexRequest(INDEX_RECORD, TYPE, record.getId());
        index(request, JacksonUtil.toJson(record));
    }

    /**
     * Returns one record with the given crawl status, with its id taken from the
     * matching document, or {@code null} if none is found or the search fails.
     */
    public Record getFirstRecordByStatus(int status) {
        return toRecord(firstHit(INDEX_RECORD, "crawled", status));
    }

    /**
     * Returns the record stored for the given URL, with its id taken from the
     * matching document, or {@code null} if none is found or the search fails.
     *
     * <p>NOTE(review): this is an exact-term query on {@code url}; it presumably
     * relies on that field being mapped as a keyword — confirm against the index
     * mapping.
     */
    public Record getRecordByUrl(String url) {
        return toRecord(firstHit(INDEX_RECORD, "url", url));
    }

    /**
     * Returns the movie stored for the given subject id, or {@code null} if none
     * is found or the search fails.
     */
    public Movie findMovieBySubjectId(String id) {
        SearchHit hit = firstHit(INDEX_MOVIE, "subjectId", id);
        return hit == null ? null : JacksonUtil.jsonToBean(hit.getSourceAsString(), Movie.class);
    }

    /** Not implemented: currently does nothing. TODO decide whether this should clear the indices. */
    public void deleteAll() {
    }

    /**
     * Returns one page of record hits sorted by document id, optionally filtered by
     * the crawl status set on {@code record}.
     *
     * @param record filter holder; may be {@code null} or have no status for "no filter"
     * @param page   1-based page number and page size; a page below 1 is treated as the first page
     * @return the hits of the requested page, or {@code null} if the search fails
     */
    public SearchHit[] queryRecord(Record record, Page page){
        SearchSourceBuilder sourceBuilder = new SearchSourceBuilder();
        if (record != null && record.getCrawled() != null) {
            sourceBuilder.query(QueryBuilders.termQuery("crawled", record.getCrawled()));
        }
        // A negative offset is not a valid "from" value, so clamp it to the first page.
        sourceBuilder.from(Math.max(0, (page.getPage() - 1) * page.getSize()));
        sourceBuilder.size(page.getSize());
        sourceBuilder.sort(new FieldSortBuilder("_id").order(SortOrder.ASC));
        return search(INDEX_RECORD, sourceBuilder);
    }

    /** Sends the JSON document with the given request, surfacing I/O failures to the caller. */
    private void index(IndexRequest request, String json) {
        request.source(json, XContentType.JSON);
        try {
            client.index(request, RequestOptions.DEFAULT);
        } catch (IOException e) {
            throw new UncheckedIOException("Failed to index document into " + request.index(), e);
        }
    }

    /** Runs an exact-term query for a single hit; returns {@code null} when nothing matches or the search fails. */
    private SearchHit firstHit(String index, String field, Object value) {
        SearchSourceBuilder sourceBuilder = new SearchSourceBuilder()
                .query(QueryBuilders.termQuery(field, value))
                .from(0)
                .size(1);
        SearchHit[] hits = search(index, sourceBuilder);
        return hits == null || hits.length == 0 ? null : hits[0];
    }

    /** Executes the search with the shared timeout; returns {@code null} on any failure. */
    private SearchHit[] search(String index, SearchSourceBuilder sourceBuilder) {
        sourceBuilder.timeout(SEARCH_TIMEOUT);
        SearchRequest searchRequest = new SearchRequest(index);
        searchRequest.source(sourceBuilder);
        try {
            SearchResponse searchResponse = client.search(searchRequest, RequestOptions.DEFAULT);
            return searchResponse.getHits().getHits();
        } catch (Exception e) {
            // Kept broad on purpose: lookups treat every failure as "not found"
            // (presumably this also covers an index that does not exist yet on a first run).
            e.printStackTrace();
            return null;
        }
    }

    /** Converts a hit into a {@link Record} carrying the document id; {@code null} in, {@code null} out. */
    private Record toRecord(SearchHit hit) {
        if (hit == null) {
            return null;
        }
        Record r = JacksonUtil.jsonToBean(hit.getSourceAsString(), Record.class);
        if (r != null) {
            r.setId(hit.getId());
        }
        return r;
    }

}

爬取3小时后的数据量:

pic


相关

最新