日韩黑丝制服一区视频播放|日韩欧美人妻丝袜视频在线观看|九九影院一级蜜桃|亚洲中文在线导航|青草草视频在线观看|婷婷五月色伊人网站|日本一区二区在线|国产AV一二三四区毛片|正在播放久草视频|亚洲色图精品一区

分享

JAVA多線程網絡爬蟲的代碼實現

 huhuwoo 2015-10-28
……
public class MyCrawler {
    public static BDBFrontier visitedFrontier;
    public static BDBFrontier unvisitedFrontier;
    private static int num = 0;  
     
    public MyCrawler() {
        try{
            if(visitedFrontier == null){
                visitedFrontier = new BDBFrontier(CrawlConfig.CRAWL_VISITED_FRONTIER);      //采用Nosql數(shù)據(jù)庫存儲訪問地址方式
                visitedFrontier.clearAll();
            }
            if(unvisitedFrontier == null) {
                unvisitedFrontier = new BDBFrontier(CrawlConfig.CRAWL_UNVISITED_FRONTIER);
                unvisitedFrontier.clearAll();
            }
        }catch(Exception e) {
            e.printStackTrace();
        }
    }
     
    private void initCrawlerWithSeeds(String[] seeds) {
        synchronized (this) {
            try {
                for(int i = 0;i<seeds.length;i++){
                    CrawlUrl url = new CrawlUrl();            //采用berkeleyDB形式
                    url.setOriUrl(seeds[i]);
                    unvisitedFrontier.putUrl(url);
                     
                }
            catch(Exception e) {
                e.printStackTrace();
            }
        }
    }
     
    public  void crawling(String[] seeds, int threadId) {
        try {
            LinkFilter filter = new LinkFilter() {
                @Override
                public boolean accept(String url) {
                    Pattern pattern = Pattern.compile("^((https|http|ftp|rtsp|mms)?://)"
                            "+(([0-9a-z_!~*'().&=+$%-]+: )?[0-9a-z_!~*'().&=+$%-]+@)?"
                            "(([0-9]{1,3}\\.){3}[0-9]{1,3}"
                            "|"
                            "([0-9a-z_!~*'()-]+\\.)*"
                            "([0-9a-z][0-9a-z-]{0,61})?[0-9a-z]\\."
                            "[a-z]{2,6})"
                            "(:[0-9]{1,4})?"
                            "((/?)|"
                            "(/[0-9a-z_!~*'().;?:@&=+$,%#-]+)+/?)$"); 
                    Matcher matcher = pattern.matcher(url);
                    boolean isMatch= matcher.matches();
                    if(isMatch && url.startsWith(CrawlConfig.CRAWL_LIMIT_PATH)) {
                        return true;
                    }
                    else {
                        return false;
                    }
                }
            };
         
             
            initCrawlerWithSeeds(seeds);
             
            //采用berkeleyDB方式存儲
                                
            CrawlUrl visitedCrawlUrl = (CrawlUrl)unvisitedFrontier.getNext();
            //visitedFrontier.putUrl(visitedCrawlUrl);
             
            do{
                System.out.println("線程:" + threadId);
                if(visitedCrawlUrl == null) {
                    continue;
                }
                             
                String visitedUrl = visitedCrawlUrl.getOriUrl();
                if(visitedFrontier.contains(visitedUrl)) {            //同步數(shù)據(jù)
                    visitedCrawlUrl = (CrawlUrl)unvisitedFrontier.getNext();
                    continue;
                }
                 
                visitedFrontier.putUrl(visitedCrawlUrl);
                 
                if(null == visitedUrl || "".equals(visitedUrl.trim())) {   //抓取的地址為空
                    visitedFrontier.putUrl(visitedCrawlUrl);
                    visitedCrawlUrl = (CrawlUrl)unvisitedFrontier.getNext();
                    continue;
                }
                 
                try{
                    RetrievePage.downloadPage(visitedUrl);                    //下載頁面
                    Set<String> links = HtmlParserTool.extractLinks(visitedUrl, filter);
                    for(String link :links) {
                        if(!visitedFrontier.contains(link)
                            &&!unvisitedFrontier.contains(link)    )
                        {
                            CrawlUrl unvisitedCrawlUrl = new CrawlUrl();
                            unvisitedCrawlUrl.setOriUrl(link);
                            unvisitedFrontier.putUrl(unvisitedCrawlUrl);
                        }
                    }
                }catch(ConnectTimeoutException e) {                            //超時繼續(xù)讀下一個地址
                    visitedFrontier.putUrl(visitedCrawlUrl);
                    visitedCrawlUrl = (CrawlUrl)unvisitedFrontier.getNext();
                    num ++;
                    e.printStackTrace();
                    continue;
                }catch(SocketTimeoutException e) {
                    visitedFrontier.putUrl(visitedCrawlUrl);
                    visitedCrawlUrl = (CrawlUrl)unvisitedFrontier.getNext();
                    num ++;
                    e.printStackTrace();
                    continue;
                }
                visitedCrawlUrl = (CrawlUrl)unvisitedFrontier.getNext();
                num ++;
                 
            }while(BDBFrontier.threads >0 && num < 1000);
        }
         
        catch (IOException e) {
            e.printStackTrace();
        }
        catch(Exception e) {
            e.printStackTrace();
        }
    }
     
}

    本站是提供個人知識管理的網絡存儲空間,所有內容均由用戶發布,不代表本站觀點。請注意甄別內容中的聯系方式、誘導購買等信息,謹防詐騙。如發現有害或侵權內容,請點擊一鍵舉報。
    轉藏 分享 獻花(0)

    0條評論

    發表

    請遵守用戶 評論公約