日韩黑丝制服一区视频播放|日韩欧美人妻丝袜视频在线观看|九九影院一级蜜桃|亚洲中文在线导航|青草草视频在线观看|婷婷五月色伊人网站|日本一区二区在线|国产AV一二三四区毛片|正在播放久草视频|亚洲色图精品一区

分享

c 語言寫的爬蟲,抓取豆瓣上所有科幻電影

 新華書店好書榜 2015-12-25

先給自己定個目標,抓取豆瓣上所有的科幻電影的名稱和類型信息,并保存文件。代碼越短越好。下面是我的代碼:

#include<cspider/spider.h>
/*
 * Results scraped from a single listing page.
 *
 * getTitle / getDesc are parallel arrays of strings filled in by the page
 * callback; `size` is the number of leading entries the save callback
 * will print. Each array holds at most 20 entries.
 */
typedef struct Movie {
  char *getTitle[20];  /* movie titles found on the page */
  char *getDesc[20];   /* matching description / category text */
  int size;            /* count of entries to be written out */
} Movie;
// Entry-point URL for the crawl: the Douban tag page for "科幻" (sci-fi,
// percent-encoded below) movies. It is handed to cs_setopt_url() in main()
// and is also used by p() as the prefix for every pagination link.
char *begin = "www.douban.com/tag/%E7%A7%91%E5%B9%BB/movie";
/*
 * Page-processing callback (registered with cs_setopt_process in main).
 *
 * Extracts movie titles, their descriptions and the pagination links from
 * the downloaded page `d`, hands the titles/descriptions to the save
 * callback via saveString(), and queues every pagination link (prefixed
 * with `begin`) as a new crawl task.
 *
 * Parameters:
 *   cspider   - crawler handle, forwarded to saveString()/addUrls().
 *   d         - page content that the XPath queries run against.
 *   url       - unused here.
 *   user_data - unused here.
 *
 * NOTE(review): everything handed to saveString()/addUrls() is freed before
 * this function returns, exactly as the original code did. That is only
 * safe if those calls consume or copy their arguments before returning --
 * confirm against the cspider implementation.
 */
void p(cspider_t *cspider, char *d, char *url, void *user_data) {
  (void)url;
  (void)user_data;

  /* calloc so that any slot xpath() does not fill is NULL, not garbage. */
  Movie *movie = calloc(1, sizeof *movie);
  if (movie == NULL) {
    return;
  }
  char *urls[20] = {0};

  /* XPath descendant queries must start with "//"; the former "http://"
   * prefix made every expression invalid. */
  int sizeTitle = xpath(d, "//div[@id='content']/div[@class='grid-16-8 clearfix']/div[@class='article']/div[@class='mod movie-list']/dl/dd/a", movie->getTitle, 20);
  int sizeDesc = xpath(d, "//div[@id='content']/div[@class='grid-16-8 clearfix']/div[@class='article']/div[@class='mod movie-list']/dl/dd/div[@class='desc']", movie->getDesc, 20);
  int sizeUrl = xpath(d, "//div[@id='content']/div[@class='grid-16-8 clearfix']/div[@class='article']/div[@class='paginator']/a/@href", urls, 20);

  /* Treat a negative return as "no matches" so the counts are safe to
   * use as loop bounds below. */
  if (sizeTitle < 0) {
    sizeTitle = 0;
  }
  if (sizeDesc < 0) {
    sizeDesc = 0;
  }
  if (sizeUrl < 0) {
    sizeUrl = 0;
  }

  /* The save callback reads getTitle[i] and getDesc[i] in lock-step, so
   * only expose as many entries as both arrays actually contain. */
  movie->size = sizeTitle < sizeDesc ? sizeTitle : sizeDesc;
  saveString(cspider, (void*)movie, LOCK);

  /* Build the follow-up URLs: begin + href. A fixed-size array replaces
   * the VLA, which had zero length when no links were found. */
  char *newUrl[20] = {0};
  int built = 0;
  size_t beginLen = strlen(begin);
  for (int i = 0; i < sizeUrl && built < 20; i++) {
    if (urls[i] == NULL) {
      continue;
    }
    size_t hrefLen = strlen(urls[i]);
    char *joined = malloc(beginLen + hrefLen + 1);
    if (joined == NULL) {
      break;  /* out of memory: queue what we have so far */
    }
    /* Copy rather than strcat: the fresh buffer is uninitialised, so
     * strcat would search for a terminator that is not there. */
    memcpy(joined, begin, beginLen);
    memcpy(joined + beginLen, urls[i], hrefLen + 1);
    newUrl[built++] = joined;
  }

  /* Only follow pagination when this page actually yielded movies. */
  if (movie->size > 0) {
    addUrls(cspider, newUrl, built);
  }

  /* Release everything allocated for this page. */
  freeStrings(newUrl, built);
  freeStrings(urls, sizeUrl);
  freeStrings(movie->getTitle, sizeTitle);
  freeStrings(movie->getDesc, sizeDesc);
  free(movie);
}
/*
 * Save callback (registered with cs_setopt_save in main).
 *
 * Writes every title/description pair of one scraped page to the output
 * stream, two lines per movie.
 *
 * Parameters:
 *   str       - the Movie record passed to saveString() by p().
 *   user_data - the FILE* that main() registered as the save context.
 *
 * Does nothing when either pointer is NULL (main's fopen may have failed).
 */
void s(void *str, void *user_data) {
  Movie *get = (Movie*)str;
  FILE *file = (FILE*)user_data;
  if (get == NULL || file == NULL) {
    return;
  }

  /* Never index past the fixed-size arrays, whatever `size` claims. */
  int capacity = (int)(sizeof get->getTitle / sizeof get->getTitle[0]);
  int count = get->size < capacity ? get->size : capacity;

  for (int i = 0; i < count; i++) {
    /* Passing NULL to %s is undefined; print an empty field instead. */
    const char *title = get->getTitle[i] ? get->getTitle[i] : "";
    const char *desc = get->getDesc[i] ? get->getDesc[i] : "";
    fprintf(file, "名稱:%s\n", title);
    fprintf(file, "類別:%s\n", desc);
  }
}
/*
 * Program entry point: configures the crawler (start URL, user agent,
 * cookie, callbacks), opens ./movies.txt for the results and runs the
 * crawl.
 *
 * Returns the value of cs_run(), or 1 if the crawler could not be created
 * or the output file could not be opened or closed.
 */
int main(void) {
  cspider_t *spider = init_cspider();
  if (spider == NULL) {
    fprintf(stderr, "init_cspider failed\n");
    return 1;
  }

  char *agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:42.0) Gecko/20100101 Firefox/42.0";
  /* NOTE(review): hard-coded browser session cookie containing personal
   * identifiers; it should be loaded from configuration or the environment
   * rather than committed to source. Left unchanged here. */
  char *cookie = "bid=s3/yuH5Jd/I; _pk_ref.100001.8cb4=%5B%22%22%2C%22%22%2C1450940218%2C%22http%3A%2F%2Fmovie.douban.com%2Ftag%2F%22%5D; _pk_id.100001.8cb4=8196f325b29ea5c3.1444265431.9.1450943478.1449364495.; ll=108288; viewed=1130500_24708145_6433169_4843567_1767120_5318823_1899158_1271597; __utma=30149280.927537245.1446813674.1449139583.1450940286.5; __utmz=30149280.1450940286.5.5.utmcsr=book.douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/; ps=y; ue=965166527@qq.com; push_noty_num=0; push_doumail_num=7; ap=1; _pk_ses.100001.8cb4=*; gr_user_id=5f4ee24f-d7bc-4b0b-9322-ceb1d208ee36; __utmb=30149280.17.10.1450940286; __utmc=30149280; ct=y; as=http://www.douban.com/tag/%E7%A7%91%E5%B9%BB/movie";
  cs_setopt_url(spider, begin);
  cs_setopt_useragent(spider, agent);
  cs_setopt_cookie(spider, cookie);

  FILE *file = fopen("./movies.txt", "wb+");
  if (file == NULL) {
    /* Without an output stream the save callback has nowhere to write. */
    perror("./movies.txt");
    return 1;
  }

  /* Register the custom parse callback and the persistence callback. */
  cs_setopt_process(spider, p, NULL);
  cs_setopt_save(spider, s, file);

  int rc = cs_run(spider);

  /* fclose flushes buffered output; a failure here means lost data. */
  if (fclose(file) != 0) {
    perror("./movies.txt");
    if (rc == 0) {
      rc = 1;
    }
  }
  return rc;
}

一共 60 行左右,還可以。就是得手動回收內存,還有字符串處理,這兩點使得代碼比較丑陋,和 python 和 java 沒得比。我用的爬蟲框架是cspider。大家覺得,如果要讓這個爬蟲框架更完善,讓我們用 c 寫爬蟲更爽,還需要實現什么呢?

    本站是提供個人知識管理的網絡存儲空間,所有內容均由用戶發布,不代表本站觀點。請注意甄別內容中的聯系方式、誘導購買等信息,謹防詐騙。如發現有害或侵權內容,請點擊一鍵舉報。
    轉藏 分享 獻花(0)

    0條評論

    發表

    請遵守用戶 評論公約

    類似文章 更多