Over the National Day holiday I wanted to track how many people are online on the bilibili website, so I wrote a simple crawler. This post mainly analyses the parameters returned by https://api.bilibili.com/x/web-interface/online to locate the fields that hold the online-user counts. The core of the program is the requests module.
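The fields read later in the script are web_online, play_online and all_count, so the interface's JSON response is roughly shaped like the following sketch (values are invented for illustration; the real response may contain more fields):

# Illustrative shape only: values are made up, the real response may differ
sample_response = {
    "code": 0,
    "data": {
        "web_online": 123456,     # users currently online on the web side
        "play_online": 23456,     # users currently watching/playing
        "all_count": 34567890,    # a third counter returned by the interface
    },
}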

Specific steps

1. Find the interface on the web page that serves the online-count data

2. Set up a proxy pool (and a pool of random request headers)

3. Request the interface and parse the JSON response to get the current number of people online

4. Sample 10 times and take the average

5. Configure a timed task to run the script every minute (see the crontab sketch after this list)

6. Import the exported CSV into Excel and draw a chart (a Python plotting alternative is sketched after the script below)
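For step 5, assuming a Linux host with cron available and an illustrative script path, a crontab entry like the following runs the collector every minute:

# illustrative crontab entry: run the collector every minute, append output to a log
* * * * * /usr/bin/python3 /path/to/bilibili_online.py >> /path/to/bilibili_online.log 2>&1

Note that the script below writes online_num.csv relative to the working directory, so when running it from cron either use an absolute path inside the script or cd into the right directory first.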

Final results

  

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import requests
import random
import json
import time

# ---------------------------------------------------------------------------------------
# Calculate the time difference between two datetimes, formatted as HH:MM:SS
def gettimediff(start, end):
    seconds = (end - start).seconds
    m, s = divmod(seconds, 60)
    h, m = divmod(m, 60)
    diff = "%02d:%02d:%02d" % (h, m, s)
    return diff


# ----------------------------------------------------------------------------------------------------------------------
# Return a request header with a randomly chosen User-Agent
def getheaders():
    user_agent_list = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    ]
    user_agent = random.choice(user_agent_list)
    headers = {'User-Agent': user_agent}
    return headers


# ----------------------------------------------------- Check whether a proxy IP is usable ---------------------
# (helper for a proxy pool; not called in the main flow below)
def checkip(targeturl, ip):
    headers = getheaders()  # random request header
    proxies = {ip.split(':')[0]: ip}  # assumes ip looks like "http://host:port", so the key becomes the scheme
    try:
        status = requests.get(url=targeturl, proxies=proxies, headers=headers, timeout=5).status_code
        return status == 200
    except requests.RequestException:
        return False


# ------------------------------------------------------- Fetch the current online counts ----------------------
def findip():  # name kept from the original script; it queries the online-count interface
    url = 'https://api.bilibili.com/x/web-interface/online'  # target interface
    headers = getheaders()  # random request header
    callback = requests.get(url=url, headers=headers, timeout=10).text
    data_json = json.loads(callback)  # parse the JSON response
    web_online = data_json['data']['web_online']    # users online on the web side
    play_online = data_json['data']['play_online']  # users currently watching/playing
    all_count = data_json['data']['all_count']
    return web_online, play_online, all_count


if __name__ == "__main__":
    num_sum = 0
    play_sum = 0
    count_sum = 0
    time_now = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))  # current time, %Y-%m-%d %H:%M:%S
    for i in range(10):  # sample ten times and take the average
        web_online, play_online, all_count = findip()  # one request per sample
        num_sum += web_online
        play_sum += play_online
        count_sum += all_count
        time.sleep(1)
    num = num_sum / 10
    play_online = play_sum / 10
    count = count_sum / 10
    with open('online_num.csv', 'a') as f:  # append one line of data per run
        f.write('%s,%s,%s,%s\n' % (time_now, num, play_online, count))
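Step 6 above uses Excel to chart the collected data. As a Python alternative, a minimal plotting sketch (assuming pandas and matplotlib are installed; the column names match the write order in the script):

import pandas as pd
import matplotlib.pyplot as plt

# The CSV has no header row, so name the columns to match the write order above
df = pd.read_csv('online_num.csv',
                 names=['time', 'web_online', 'play_online', 'all_count'],
                 parse_dates=['time'])
df.plot(x='time', y=['web_online', 'play_online'], figsize=(10, 5))
plt.ylabel('online users')
plt.savefig('online_num.png')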