创建Spring Boot项目

依赖选择:Spring Web、MySQL Driver、Lombok


添加WebMagic依赖(写入pom.xml的<dependencies>中)

<!-- WebMagic core module -->
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-core</artifactId>
<version>0.10.0</version>
<exclusions>
<!-- NOTE(review): presumably excluded so the log4j12 binding does not clash with Spring Boot's own logging setup; confirm -->
<exclusion>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
</exclusion>
</exclusions>
</dependency>
<!-- WebMagic extension module -->
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-extension</artifactId>
<version>0.10.0</version>
</dependency>
<!-- Guava, needed for WebMagic's Bloom-filter support -->
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
<version>16.0</version>
</dependency>

添加MyBatis-Plus依赖

<!-- MyBatis-Plus Spring Boot starter -->
<dependency>
<groupId>com.baomidou</groupId>
<artifactId>mybatis-plus-boot-starter</artifactId>
<version>3.5.5</version>
</dependency>
<!-- mybatis-spring declared with an explicit version -->
<!-- NOTE(review): presumably pinned to override the version pulled in transitively by the starter; confirm -->
<dependency>
<groupId>org.mybatis</groupId>
<artifactId>mybatis-spring</artifactId>
<version>3.0.3</version>
</dependency>

配置application.yml(将url中的{database}替换为实际的数据库名,并填写自己的用户名和密码)

# Spring Boot application configuration for the crawler project.
server:
  port: 8080 # HTTP port

spring:
  main:
    allow-circular-references: true # whether circular bean dependencies are allowed
  datasource:
    driver-class-name: com.mysql.cj.jdbc.Driver
    # Replace {database} with the actual schema name.
    url: jdbc:mysql://localhost:3306/{database}?serverTimezone=Asia/Shanghai&useUnicode=true&characterEncoding=utf-8&zeroDateTimeBehavior=convertToNull&useSSL=false&allowPublicKeyRetrieval=true
    username: root
    password: j


mybatis-plus:
  type-aliases-package: com.bangumi_crawler.pojo # package scanned for entity classes
  mapper-locations: "classpath*:/mapper/**/*.xml" # location of the Mapper.xml files
  configuration:
    map-underscore-to-camel-case: false # whether underscore-to-camel-case mapping is enabled
    cache-enabled: false # whether the second-level cache is enabled
  global-config:
    db-config:
      update-strategy: not_null # only non-null fields are written on update

手动获取Bean的工具类

// utils/BeanUtils.java

/**
 * Keeps a static reference to the Spring {@link ApplicationContext} so that beans
 * can be looked up by hand through the static {@code getBean} methods.
 */
@Component
public class BeanUtils implements ApplicationContextAware {

    /** The stored application context; {@code null} until {@link #setApplicationContext} has run. */
    protected static ApplicationContext applicationContext;

    /**
     * Looks up a bean by its type in the stored application context.
     *
     * @param clazz the type of the bean to fetch
     * @param <T>   the bean type
     * @return the bean returned by the application context for {@code clazz}
     */
    public static <T> T getBean(Class<T> clazz) {
        T bean = applicationContext.getBean(clazz);
        return bean;
    }

    /**
     * Looks up a bean by its bean name in the stored application context.
     *
     * @param name the name the bean is registered under
     * @return the bean returned by the application context for {@code name}
     */
    public static Object getBean(String name) {
        Object bean = applicationContext.getBean(name);
        return bean;
    }

    /**
     * Stores the given context in the static field, unless a context is already stored.
     *
     * @param arg0 the application context handed in through {@link ApplicationContextAware}
     * @throws BeansException declared by the {@link ApplicationContextAware} contract
     */
    @Override
    public void setApplicationContext(ApplicationContext arg0) throws BeansException {
        // A context that is already stored is never replaced.
        if (applicationContext != null) {
            return;
        }
        applicationContext = arg0;
    }
}

task模板(Pipeline与PageProcessor)

// task/BangumiPipeline.java

// Spring component implementing WebMagic's Pipeline; process() below receives the extraction results.
@Component
public class BangumiPipeline implements Pipeline {

// ResultItems holds the extracted results in a Map-like structure:
// values stored with page.putField(key, value)
// can be read back with ResultItems.get(key).
@Override
public void process(ResultItems resultItems, Task task) {
// Template placeholder: no result handling is implemented yet.

}
}

// task/Processor.java

/**
 * WebMagic {@link PageProcessor} template that is registered as a Spring bean and
 * starts its own crawl on a fixed schedule.
 */
@Component
public class Processor implements PageProcessor {

    /** Crawl settings returned to WebMagic through {@link #getSite()}. */
    private final Site site = Site.me()
            .setCharset("UTF-8")       // page encoding
            .setSleepTime(1)           // pause between fetches, in milliseconds
            .setTimeOut(1000 * 10)     // download timeout, in milliseconds
            .setRetrySleepTime(3000)   // wait before a retry, in milliseconds
            .setRetryTimes(3);         // number of retries

    /** Seed URL of the crawl; left empty in this template and to be filled in. */
    private String url = "";

    /** Pipeline that receives the extracted results. */
    @Autowired
    private Pipeline pipeline;

    /**
     * Extraction logic for a downloaded page. Empty in this template.
     *
     * @param page the downloaded page
     */
    @Override
    public void process(Page page) {
    }

    /**
     * @return the crawl settings used by the spider
     */
    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Builds a spider and runs one crawl. Scheduled 1 second after startup and then
     * again 100 seconds after each run finishes.
     */
    @Scheduled(initialDelay = 1000, fixedDelay = 100 * 1000)
    public void process() {
        HttpClientDownloader httpClientDownloader = new HttpClientDownloader();
        // Use this Spring-managed instance rather than "new Processor()": an instance
        // created with "new" is not injected, so its @Autowired fields would be null
        // while pages are being processed.
        Spider.create(this)
                .addUrl(url) // initial URL to visit
                // the argument is the number of entries the Bloom filter is sized to de-duplicate
                .setScheduler(new QueueScheduler().setDuplicateRemover(new BloomFilterDuplicateRemover(1000000)))
                .thread(10) // number of crawler threads
                .setDownloader(httpClientDownloader)
                .addPipeline(pipeline)
                .run(); // runs the crawl in the calling thread
    }
}

启动类开启定时任务

在启动类(带有@SpringBootApplication注解的Application类)上添加@EnableScheduling注解,否则@Scheduled定时任务不会执行


完结撒花