Java多线程B站爬虫与45万条视频数据,mysql批量写入性能对比,附代码

xiaoxiao2021-02-27  716

恩,萌新刚来,听学长说写博客可以总结梳理自己的知识,所以来试试,自娱自乐,不喜莫喷。目前还是大二狗,学Java半年多,错误很多,望大神指正。

本文涉及:Java多线程,单例模式,爬虫相关技术,MySQL,JDBC,SQL优化

刚学Java还没学sql时写过一个B站爬虫,但是由于自己临时学的sql速度太慢,爬取45万条数据用了四五个小时,速度太慢,最近sql与Java有些进步,优化了一下爬虫,没有再次爬太多视频数据(爬太多B站会封IP),恩,结果是6分钟8万条数据,还算满意,我把我的设计思路分享一下,供萌新们参考,大神轻点打脸

因为限制爬虫速度的第一是网速,第二是数据库,但是它们对硬件的占用互不冲突,所以我就打算把它们分到不同的线程中:

sql插入优化在另一篇文章中:

核心流程:

step1:爬取数据:因为从网上获取数据有很大的延迟,线程很长时间处于等待状态,对计算机压力不大,所以尽量多开几个线程,我一般开5个

step2:解析并暂存:不管有多少个线程,爬取后都把json中需要的数据解析成本地的数据类,全部存入一个队列中

step3:数据库写入:数据库只开一个线程,因为写入占用硬盘,限制条件是硬盘速度,多线程意义不大。扫描step2中的队列,如果不为空,从头部取出一个并从队列中移除,然后编入数据库批量写入命令中,达到预定的阈值后一次写入多条。

优点:爬取和写入分开,写入时爬取线程不必处于等待状态,系统利用率高

缺点:系统资源占用高

核心算法确定完毕,具体操作如下:

首先用Fiddler4找到接口:

在不断地访问B站后找到了加载视频列表的API

然后访问不同的分区,观察规律,最后总结出获取不同分区视频列表的API函数

/**
 * Sub-section ids ("tid" values) that are passed to the video-list API.
 * Callers address this array by index.
 *
 * The last eleven entries repeat the first eleven, giving 60 entries in total.
 * NOTE(review): presumably padding so that index ranges divide evenly between
 * crawler threads - confirm; the repeated ids are requested twice.
 */
public static final int[] categories = {
        22, 26, 126, 127, 157, 158, 164, 159, 71, 137, 131,
        24, 25, 47, 27,
        33, 32, 153, 51, 152,
        28, 31, 30, 59, 29, 54, 130,
        20, 154, 156,
        17, 65, 136, 19, 121,
        37, 124, 122, 39, 96, 95, 98,
        138, 21, 76, 75, 161, 162, 163,
        22, 26, 126, 127, 157, 158, 164, 159, 71, 137, 131
};

/**
 * Builds the video-list API URL for one sub-section and one result page.
 *
 * @param categories sub-section id, sent as the {@code tid} query parameter
 * @param pagenum    page number, sent as the {@code pn} query parameter
 * @return the request URL
 */
public static String GetUrl(int categories, int pagenum) {
    // Plain concatenation instead of String.format("%d"): format() honours the
    // default locale and may emit non-ASCII digits, which would corrupt the URL.
    return "http://api.bilibili.com/archive_rank/getarchiverankbypartion?callback=?&type=jsonp&tid="
            + categories + "&pn=" + pagenum + "&_=?";
}

categories对应不同的小区(例如鬼畜大区下的鬼畜调教小区对应22,我没发现大区的API,发现的同学请在留言中提到,万分感谢)

懒得找的同学可以用我找到的信息:

// Maps a sub-section id to its display name, its top-level area name and the
// area number; every branch fills avType and returns it. Unknown ids yield
// "未知" with area number 99.
switch (category) {
    // Area 1
    case 24: avType.setTypeName("动画--MAD·AMV"); avType.setArea("动画"); avType.setAreaNum(1); return avType;
    case 25: avType.setTypeName("动画--MMD·3D"); avType.setArea("动画"); avType.setAreaNum(1); return avType;
    case 47: avType.setTypeName("动画--短片·手书·配音"); avType.setArea("动画"); avType.setAreaNum(1); return avType;
    case 27: avType.setTypeName("动画--综合"); avType.setArea("动画"); avType.setAreaNum(1); return avType;
    // Area 2
    case 33: avType.setTypeName("番剧--连载动画"); avType.setArea("番剧"); avType.setAreaNum(2); return avType;
    case 32: avType.setTypeName("番剧--完结动画"); avType.setArea("番剧"); avType.setAreaNum(2); return avType;
    case 153: avType.setTypeName("番剧--国产动画"); avType.setArea("番剧"); avType.setAreaNum(2); return avType;
    case 51: avType.setTypeName("番剧--资讯"); avType.setArea("番剧"); avType.setAreaNum(2); return avType;
    case 152: avType.setTypeName("番剧--官方延伸"); avType.setArea("番剧"); avType.setAreaNum(2); return avType;
    // Area 3
    case 28: avType.setTypeName("音乐--原创音乐"); avType.setArea("音乐"); avType.setAreaNum(3); return avType;
    case 31: avType.setTypeName("音乐--翻唱"); avType.setArea("音乐"); avType.setAreaNum(3); return avType;
    case 30: avType.setTypeName("音乐--VOCALOID·UTAU"); avType.setArea("音乐"); avType.setAreaNum(3); return avType;
    case 59: avType.setTypeName("音乐--演奏"); avType.setArea("音乐"); avType.setAreaNum(3); return avType;
    case 29: avType.setTypeName("音乐--三次元音乐"); avType.setArea("音乐"); avType.setAreaNum(3); return avType;
    case 54: avType.setTypeName("音乐--OP/ED/OST"); avType.setArea("音乐"); avType.setAreaNum(3); return avType;
    case 130: avType.setTypeName("音乐--音乐选集"); avType.setArea("音乐"); avType.setAreaNum(3); return avType;
    // Area 4
    case 20: avType.setTypeName("舞蹈--宅舞"); avType.setArea("舞蹈"); avType.setAreaNum(4); return avType;
    case 154: avType.setTypeName("舞蹈--三次元舞蹈"); avType.setArea("舞蹈"); avType.setAreaNum(4); return avType;
    case 156: avType.setTypeName("舞蹈--宅舞教程"); avType.setArea("舞蹈"); avType.setAreaNum(4); return avType;
    // Area 5
    case 17: avType.setTypeName("游戏--单机游戏"); avType.setArea("游戏"); avType.setAreaNum(5); return avType;
    case 65: avType.setTypeName("游戏--网游-电竞"); avType.setArea("游戏"); avType.setAreaNum(5); return avType;
    case 136: avType.setTypeName("游戏--音游"); avType.setArea("游戏"); avType.setAreaNum(5); return avType;
    case 19: avType.setTypeName("游戏--Mugen"); avType.setArea("游戏"); avType.setAreaNum(5); return avType;
    case 121: avType.setTypeName("游戏--GMV"); avType.setArea("游戏"); avType.setAreaNum(5); return avType;
    // Area 6
    case 37: avType.setTypeName("科技--纪录片"); avType.setArea("科技"); avType.setAreaNum(6); return avType;
    case 124: avType.setTypeName("科技--趣味科普-人文"); avType.setArea("科技"); avType.setAreaNum(6); return avType;
    case 122: avType.setTypeName("科技--野生技术协会"); avType.setArea("科技"); avType.setAreaNum(6); return avType;
    case 39: avType.setTypeName("科技--演讲-公开课"); avType.setArea("科技"); avType.setAreaNum(6); return avType;
    case 96: avType.setTypeName("科技--星海"); avType.setArea("科技"); avType.setAreaNum(6); return avType;
    case 95: avType.setTypeName("科技--数码"); avType.setArea("科技"); avType.setAreaNum(6); return avType;
    case 98: avType.setTypeName("科技--机械"); avType.setArea("科技"); avType.setAreaNum(6); return avType;
    // Area 7
    case 138: avType.setTypeName("生活--搞笑"); avType.setArea("生活"); avType.setAreaNum(7); return avType;
    case 21: avType.setTypeName("生活--日常"); avType.setArea("生活"); avType.setAreaNum(7); return avType;
    case 76: avType.setTypeName("生活--美食圈"); avType.setArea("生活"); avType.setAreaNum(7); return avType;
    case 75: avType.setTypeName("生活--动物圈"); avType.setArea("生活"); avType.setAreaNum(7); return avType;
    case 161: avType.setTypeName("生活--手工"); avType.setArea("生活"); avType.setAreaNum(7); return avType;
    case 162: avType.setTypeName("生活--绘画"); avType.setArea("生活"); avType.setAreaNum(7); return avType;
    case 163: avType.setTypeName("生活--运动"); avType.setArea("生活"); avType.setAreaNum(7); return avType;
    // Area 8
    case 22: avType.setTypeName("鬼畜--鬼畜调教"); avType.setArea("鬼畜"); avType.setAreaNum(8); return avType;
    case 26: avType.setTypeName("鬼畜--音MAD"); avType.setArea("鬼畜"); avType.setAreaNum(8); return avType;
    case 126: avType.setTypeName("鬼畜--人力VOCALOID"); avType.setArea("鬼畜"); avType.setAreaNum(8); return avType;
    case 127: avType.setTypeName("鬼畜--教程演示"); avType.setArea("鬼畜"); avType.setAreaNum(8); return avType;
    // Area 9
    case 157: avType.setTypeName("时尚--美妆"); avType.setArea("时尚"); avType.setAreaNum(9); return avType;
    case 158: avType.setTypeName("时尚--服饰"); avType.setArea("时尚"); avType.setAreaNum(9); return avType;
    case 164: avType.setTypeName("时尚--健身"); avType.setArea("时尚"); avType.setAreaNum(9); return avType;
    case 159: avType.setTypeName("时尚--资讯"); avType.setArea("时尚"); avType.setAreaNum(9); return avType;
    // Area 10
    case 71: avType.setTypeName("娱乐--综艺"); avType.setArea("娱乐"); avType.setAreaNum(10); return avType;
    case 137: avType.setTypeName("娱乐--明星"); avType.setArea("娱乐"); avType.setAreaNum(10); return avType;
    case 131: avType.setTypeName("娱乐--Korea相关"); avType.setArea("娱乐"); avType.setAreaNum(10); return avType;
    // Any id not listed above
    default: avType.setTypeName("未知"); avType.setArea("未知"); avType.setAreaNum(99); return avType;
}

pagenum对应视频列表的第几页,每页有多条数据

例如访问

http://api.bilibili.com/archive_rank/getarchiverankbypartion?callback=?&type=jsonp&tid=22&pn=1&_=?

就会返回一个包含鬼畜调教第一页信息的json。通过不同的参数访问这个API,就会返回包含视频信息的json对象。解析这个json,并取出我们需要的信息保存在ArrayList中,这个就不多说了,有现成的json解析包。如下就是从网上获取json的run方法:

package iss2015302580343.whu.homework_5; import java.util.ArrayList; /** * Created by Tao on 2017/1/6. */ public class WriterThread extends Thread { MySQLAccess mySQLAccess = MySQLAccess.getAccess(); HttpGetter httpGetter = new HttpGetter(); JsonReader jsonReader = new JsonReader(); String json=null; int page; int startNum; int endNum; public WriterThread(int page,int start,int end){ this.page=page;this.startNum=start;this.endNum=end; } @Override public void run() { for (int i = startNum; i < endNum; i++) { for (int j = 1; j <= page; j++) { //j代表page,每个分类读取多少页 //i代表第几个分类,从categories[]中获取 json = httpGetter.GetByString(BILIBILIValues.GetUrl(BILIBILIValues.categories[i], j)); ArrayList<AVMode> avModes = jsonReader.jsonRead(json); if (avModes == null) { continue; } } } } } 然后设计一个可以存储所有信息的数据库,也不多说 为了在不同的地方统一操作,我们把数据库操作的类设计为单例模式并实现runable接口:waitForWrite就是等待写入的ArrayList public void run() { while (true){ System.out.print(""); if(waitForWrite.size()>0){ for (int i=0;i<waitForWrite.size();i++){ try { writeIn(waitForWrite.get(i)); }catch (MySQLIntegrityConstraintViolationException exception){ //跳过重复的 } catch (SQLException e) { e.printStackTrace(); } waitForWrite.remove(i); System.out.println("++++++++++++++++++"); } } } } writeIn的实现为: private void writeIn(ArrayList<AVMode> avModes) throws SQLException { StringBuilder sql= new StringBuilder("INSERT INTO " + MySQLValues.getTablename() +" values "); int i=0; for(;i<avModes.size();i++){ sql.append(" ("); sql.append( avModes.get(i).getAid()+","); sql.append( "'"+ avModes.get(i).getName().replace("'","-").replace("\"","-")+"',"); sql.append( avModes.get(i).getPlay()+","); sql.append( "'"+ avModes.get(i).getAuther().replace("'","-").replace("\"","-")+"',"); sql.append( avModes.get(i).getCategories()+","); sql.append( avModes.get(i).getFavorites()+","); sql.append( avModes.get(i).getCoins()+","); sql.append( avModes.get(i).getDanmaku()+","); sql.append( BILIBILIValues.CategoriesNames(avModes.get(i).getCategories()).getAreaNum()); 
if(i<avModes.size()-1){ sql.append(" ),"); }else { sql.append(" );"); } } PreparedStatement preparedStatement= connection.prepareStatement(sql.toString()); preparedStatement.execute(); } 每一页多条数据一次写入 ,速度加快了10倍

拿到数据后就可以干很多事了:

例如加权排序:

/**
 * Finds the videos of one top-level area and orders them by a weighted score:
 * {@code favorites * 4 + coins * 4 + danmaku * 2}, i.e. 40% favorites,
 * 40% coins and 20% danmaku, highest score first.
 *
 * @param area top-level area number, compared with the {@code area} column
 * @param num  maximum number of rows to return; values below 1 fall back to 10,
 *             the limit that used to be hard-coded in the query
 * @return the result set of the query, or {@code null} if the query failed
 *         (the exception is printed, not rethrown)
 */
public ResultSet FindArea(int area, int num) {
    // Honour the requested row count; keep the historical 10 for non-positive values.
    int limit = num > 0 ? num : 10;
    ResultSet result = null;
    try {
        String sql = "SELECT * FROM " + MySQLValues.getTablename()
                + " WHERE area = ?"
                + " ORDER BY (favorites * 4 + coins * 4 + danmaku * 2) DESC"
                + " LIMIT ?";
        // The statement is not closed here because the returned ResultSet is still in use.
        PreparedStatement preparedStatement = connection.prepareStatement(sql);
        preparedStatement.setInt(1, area);
        preparedStatement.setInt(2, limit);
        result = preparedStatement.executeQuery();
    } catch (SQLException e) {
        e.printStackTrace();
    }
    return result;
}

鬼畜区试验:

这是新爬的10万条数据的数据库结果:(算完时间忘关了,多爬了些)

这是以前的45万条数据的数据库:

不信的看数据库大小:

最后附上项目代码,lib都在里面https://git.oschina.net/iss2015302580343/bilibilipachong.git

转载请注明原文地址: https://www.6miu.com/read-67.html

最新回复(0)