爬取思路:
1,在首页上爬取这些推荐博文:https://www.cnblogs.com/
3,根据这些推荐博文,进一步进入发布它们的博主的主页
4,爬取标签的话可以查看这些博主的标签 只要在博主主页后加一个/tag/就可以跳转到标签页中
4,如果要爬取内容的话,就可以进入这些博主的所有页面中进行爬取
下面是我的代码:
1 package use; 2 3 import java.sql.Connection; 4 import java.sql.PreparedStatement; 5 import java.util.ArrayList; 6 import java.util.Date; 7 import java.util.List; 8 9 import com.dao.ClarifyDao; 10 import com.dao.InfoDao; 11 import org.jsoup.Jsoup; 12 import org.jsoup.nodes.Document; 13 14 import us.codecraft.webmagic.Page; 15 import us.codecraft.webmagic.Site; 16 import us.codecraft.webmagic.Spider; 17 import us.codecraft.webmagic.processor.PageProcessor; 18 19 public class 博客园内容 implements PageProcessor { 20 static int nn=0; 21 static String regEx="[\n`'' ]"; 22 // static String regEx="[\n`~!@#$%^&()+=|{}':;',\\[\\].<>/?~!@#¥%……&*()——+|{}【】‘;:”“’。, 、?? ]"; 23 static String aa = "";//这里是将特殊字符换为aa字符串," "代表直接去掉 24 private static Connection conn = null; 25 26 private static PreparedStatement ps = null; 27 // 标题和链接获取 28 29 private static String TITLEQUERY = "div.post_item_body h3 a.titlelnk"; 30 31 private static String TITLE = "div.post h1 a.postTitle2"; 32 // 作者 33 34 private static String AUTHORQUERY = "div.post_item_foot a.lightblue "; 35 36 37 //初始化带爬取网页地址 38 private static List<String> urls() { 39 List listUrl=new ArrayList<String>(); 40 for(int i=1;i<=200;i++) { 41 listUrl.add("https://www.cnblogs.com/sitehome/p/"+i); 42 43 } 44 listUrl.toArray(new String[listUrl.size()]); 45 return listUrl; 46 } 47 private static void add_urls_child(Page page) { 48 List listUrl=new ArrayList<String>(); 49 listUrl= page.getHtml().xpath("//*[@id=\"post_list\"]//*/div[2]/div/a//@href").all(); 50 51 listUrl.toArray(new String[listUrl.size()]); 52 page.addTargetRequests(listUrl); 53 54 } 55 56 private static void add_urls_child_page(Page page) { 57 List listUrl=new ArrayList<String>(); 58 listUrl= page.getHtml().xpath("//div[@class=\"postTitle\"]/a//@href").all(); 59 60 listUrl.toArray(new String[listUrl.size()]); 61 page.addTargetRequests(listUrl); 62 63 } 64 65 //jsoup根据html字符串和语法来获取内容 66 private static String selectDocumentText(String htmlText,String Query) { 67 Document 
doc=Jsoup.parse(htmlText); 68 String select=doc.select(Query).text(); 69 return select; 70 } 71 72 //jsoup根据html字符串和语法获取链接地址 73 private static String selectDocumentLink(String htmlText,String Query) { 74 Document doc=Jsoup.parse(htmlText); 75 String select=doc.select(Query).attr("href"); 76 return select; 77 } 78 79 @Override 80 public Site getSite() { 81 return Site.me().setSleepTime(1000).setRetryTimes(10); 82 } 83 84 //编写抽取逻辑 85 @Override 86 public void process(Page page) { 87 nn=nn+1; 88 if(nn==1) 89 { 90 System.out.println("TTTTTTTTTTTTT"); 91 page.addTargetRequests(urls()); 92 } 93 94 String str = page.getUrl().get(); 95 96 if(str.matches("https://www.cnblogs.com/sitehome/p/[0-9]+")) 97 { 98 System.out.println("AAAAA"); 99 add_urls_child(page); 100 } 101 else if(str.matches("https://www.cnblogs.com/[A-Za-z0-9_-]+/")) 102 { 103 System.out.println("BBBBBBB"); 104 add_urls_child_page(page); 105 }else 106 { 107 System.out.println("DDDDDD"); 108 109 String title=page.getHtml().xpath("//*[@id='cb_post_title_url']//text()").get(); 110 111 String URL=page.getUrl().get(); 112 113 114 115 String author=page.getHtml().xpath("//*[@id='Header1_HeaderTitle']//text()").get(); 116 List<String> values=new ArrayList<String>(); 117 values=page.getHtml().xpath("//*[@id='likecs_post_body']//*//text()").all(); 118 String info=""; 119 for(String value:values) 120 { 121 info+=value; 122 } 123 info=info.replaceAll(regEx, aa); 124 System.out.println("Title:\t"+title); 125 System.out.println("AUTHOR:\t"+author); 126 System.out.println( "VALUE:\t"+info); 127 System.out.println("URL:\t"+URL); 128 ClarifyDao.add("blog_info","",title,author,info,URL); 129 130 } 131 132 133 134 135 /* 136 //定义如何抽取页面信息 137 138 List<String> htmls=page.getHtml().xpath("//div[@class='post_item']/html()").all(); 139 140 // List<JavaBokeModel> javaBokes=new ArrayList<JavaBokeModel>(); 141 for(String html:htmls) { 142 // JavaBokeModel javaBoke=new JavaBokeModel(); 143 //标题和链接 144 String 
title=selectDocumentText(html,TITLEQUERY); 145 146 String linke=selectDocumentLink(html,TITLEQUERY); 147 //作者和作者主页 148 String author=selectDocumentText(html,AUTHORQUERY); 149 150 System.out.println( 151 "TITLE\t"+title+ 152 "Link\t"+linke+ 153 "Author\t"+author 154 ); 155 156 157 158 } 159 */ 160 //File.WriteStringToFile2(javaBokes); 161 162 163 } 164 165 public static void main(String[] args) { 166 long startTime,endTime; 167 //DBUtil.getConnection(); 168 startTime=new Date().getTime(); 169 InfoDao.delete("blog_info"); 170 Spider create=Spider.create(new 博客园内容()); 171 create.addUrl("https://www.cnblogs.com/").thread(5).run(); 172 try { 173 ps.close(); 174 conn.close(); 175 }catch(Exception e) { 176 177 } 178 endTime=new Date().getTime(); 179 System.out.println("用时为:"+(endTime-startTime)/1000+"s"); 180 181 } 182 183 }