Hugo博客公告弹窗

haproxy之爬虫拦截规则

    注意:后端服务器监听80端口(HTTP),但前端是HTTPS
    HAProxy将终止SSL,并以HTTP协议与后端通信

新建爬虫黑名单文件

nano /etc/haproxy/blacklist-agent.txt

内容(每行一个关键字,配置中以不区分大小写的子串方式匹配 User-Agent;注意:列表中包含 Googlebot,会同时屏蔽谷歌搜索的收录,如需被谷歌收录请删除该行)

YandexBot
DotBot
SemrushBot
AhrefsBot
BLEXBot
YaK
MJ12bot
MauiBot
MegaIndex.ru
GPTBot
meta-externalagent/1.1
Qwantbot
ClaudeBot
Amazonbot
SearchBot
DataForSeoBot
Barkrowler
GoogleOther
Googlebot

更多爬虫

https://raw.githubusercontent.com/mitchellkrogza/nginx-ultimate-bad-bot-blocker/master/_generator_lists/bad-user-agents.list

配置文件

配置参考: https://github.com/woniu336/open_shell/blob/main/u-haproxy.cfg

global
    # Process identity and isolation: detach as a daemon, run under the
    # haproxy user/group, and confine the process to a chroot directory.
    daemon
    user haproxy
    group haproxy
    chroot /var/lib/haproxy

    # Process-wide ceiling on concurrent connections.
    maxconn 10000

    # Two syslog targets on the local log socket, one per facility, each with
    # its own minimum severity.
    log /dev/log local0 warning
    log /dev/log local1 notice

    # Admin-level stats/runtime socket; expose-fd listeners lets the listening
    # sockets be handed over through this socket.
    stats socket /run/haproxy/admin.sock mode 660 level admin expose-fd listeners
    stats timeout 30s

    # TLS session cache tuning.
    tune.ssl.cachesize 50000
    tune.ssl.lifetime 300
    tune.ssl.ssl-ctx-cache-size 1000

    # Global TLS defaults for every "bind ... ssl" line: SSLv3, TLSv1.0 and
    # TLSv1.1 are disabled and the client's cipher preference order is used.
    ssl-default-bind-options no-sslv3 no-tlsv10 no-tlsv11 prefer-client-ciphers
    # Cipher list for TLS 1.2 and below.
    ssl-default-bind-ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256
    # Cipher suites for TLS 1.3.
    ssl-default-bind-ciphersuites TLS_AES_128_GCM_SHA256:TLS_AES_256_GCM_SHA384:TLS_CHACHA20_POLY1305_SHA256
	
defaults
    # Every proxy below works at the HTTP layer and inherits the loggers
    # declared in the global section.
    mode http
    log global

    # Logging format and HTTP connection behaviour.
    option httplog
    option dontlognull
    option http-keep-alive
    # Append the client address to X-Forwarded-For on forwarded requests.
    option forwardfor

    # Load-balancing algorithm used by backends.
    balance roundrobin

    # Timeouts.
    timeout connect 5s
    timeout client  30s
    timeout server  30s
    timeout http-keep-alive 15s

    # Custom response files returned for HAProxy-generated errors.
    errorfile 400 /etc/haproxy/errors/400.http
    errorfile 403 /etc/haproxy/errors/403.http
    errorfile 408 /etc/haproxy/errors/408.http
    errorfile 500 /etc/haproxy/errors/500.http
    errorfile 502 /etc/haproxy/errors/502.http
    errorfile 503 /etc/haproxy/errors/503.http
    errorfile 504 /etc/haproxy/errors/504.http

frontend http-in
    bind *:80

    # ============== Crawler blocking ==============
    # bad_bots:      User-Agent contains any entry of the blacklist file
    # suspicious_ua: User-Agent contains a common scripting-client name
    # empty_ua:      the request carries no User-Agent header at all
    # All string matches are case-insensitive substring matches.
    acl bad_bots hdr_sub(user-agent) -i -f /etc/haproxy/blacklist-agent.txt
    acl suspicious_ua hdr_sub(user-agent) -i "python" "curl" "wget"
    acl empty_ua hdr_cnt(user-agent) eq 0

    # Reject the request as soon as any one of the ACLs above matches.
    http-request deny if empty_ua || suspicious_ua || bad_bots
    # ============== End of crawler blocking ==============

    # Requests that were not denied and did not arrive over TLS are
    # redirected permanently to HTTPS.
    http-request redirect scheme https code 301 if !{ ssl_fc }

frontend https-in
    # TLS termination; certificates are loaded from the certs directory and
    # both HTTP/2 and HTTP/1.1 are offered through ALPN.
    bind *:443 ssl crt /etc/haproxy/certs/ alpn h2,http/1.1 allow-0rtt

    # allow-0rtt accepts TLS 1.3 early data, which can be replayed by an
    # attacker. Only GET/HEAD requests may be processed from early data; every
    # other method waits until the TLS handshake has completed.
    http-request wait-for-handshake if !METH_GET

    # ============== HTTPS crawler blocking ==============
    # empty_ua:      the request carries no User-Agent header at all
    # suspicious_ua: User-Agent contains a common scripting-client name
    # bad_bots:      User-Agent contains any entry of the blacklist file
    # All string matches are case-insensitive substring matches.
    acl empty_ua hdr_cnt(user-agent) eq 0
    acl suspicious_ua hdr_sub(user-agent) -i "python" "curl" "wget"
    acl bad_bots hdr_sub(user-agent) -i -f /etc/haproxy/blacklist-agent.txt

    http-request deny if empty_ua
    http-request deny if suspicious_ua
    http-request deny if bad_bots
    # ============== End of crawler blocking ==============

    # TLS ends here and the backend is reached over plain HTTP, so tell it the
    # original scheme. set-header also overwrites any value sent by the client.
    http-request set-header X-Forwarded-Proto https

    # Security response headers
    http-response set-header Strict-Transport-Security "max-age=31536000; includeSubDomains; preload"
    # http-response set-header X-Frame-Options "SAMEORIGIN"
    # http-response set-header Content-Security-Policy "frame-ancestors 'self'"
    http-response set-header X-Content-Type-Options "nosniff"
    http-response set-header X-XSS-Protection "1; mode=block"
    http-response set-header Referrer-Policy "strict-origin-when-cross-origin"

    # Host-based routing rules
    acl domain1_https hdr(host) -i 123.com
    use_backend backend1 if domain1_https
    # acl domain2_https hdr(host) -i 789.com
    # use_backend backend2 if domain2_https

    # Fallback backend when no routing rule matches
    default_backend backend1

backend backend1
    # Single origin server reached over plain HTTP on port 80. Health checks
    # run every 10s; 3 consecutive failures mark it down, 3 consecutive
    # successes bring it back up.
    server server1 8.8.8.8:80 check inter 10s fall 3 rise 3

# Disabled template for a second backend:
# backend backend2
    # server server2 3.3.3.3:80 check inter 10s rise 3 fall 3

验证

haproxy -c -f /etc/haproxy/haproxy.cfg

重启服务(如需不中断现有连接,可改用 systemctl reload haproxy)

systemctl restart haproxy

检查状态

systemctl status haproxy

测试(应返回 403 Forbidden)

curl -A "GPTBot" http://your-server/

获取真实客户端 IP:在源站(后端)的 Nginx 配置文件中添加如下内容

# Take the client address from the X-Forwarded-For header, but only for
# connections coming from the trusted reverse-proxy address below.
real_ip_header X-Forwarded-For;
# Skip trusted addresses when walking the header, so the last non-trusted
# address is used as the client address.
real_ip_recursive on;
# Placeholder: replace with the address of the reverse proxy.
set_real_ip_from 你的反代ip;
CC BY-NC-SA 4.0 转载请注明
最后更新于 2025-06-04 15:20
clarity统计