Web Scraping

import requests
from bs4 import BeautifulSoup

# Download the Stack Overflow question listing and print each question's
# title and vote count.
# A timeout keeps the script from hanging forever on a stalled connection,
# and raise_for_status() turns an HTTP error page into an explicit exception
# instead of silently parsing it.
response = requests.get("https://stackoverflow.com/questions", timeout=10)
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")

questions = soup.select(".question-summary")
if questions:
    # Inspect the first match: all of its HTML attributes, then its id
    # (falling back to 0 when the element has no id attribute).
    print(questions[0].attrs)
    print(questions[0].get("id", 0))
else:
    # The selector matched nothing; indexing questions[0] would raise IndexError.
    print("No questions found")

for question in questions:
    title = question.select_one(".question-hyperlink")
    votes = question.select_one(".vote-count-post")
    if title is None or votes is None:
        # select_one() returns None when nothing matches; skip such entries
        # rather than failing with AttributeError.
        continue
    print(title.get_text())
    print(votes.get_text())

{'class': ['question-summary'], 'id': 'question-summary-67532680'}
question-summary-67532680
GKE Nginx Ingress Controller Oauth2 Proxy redirect
0
Workaround on nested async completion blocks from network calls? Without using PromiseKit
0
ax.text not being printed when using transform
0
Using html & javascript Populate textbox
0
How to pass JSON data through the Django backend to frontend view using Angular
0
Gitlab CI cannot pull private registry with DOCKER_AUTH_CONFIG
0
Shopify SKU Lookup using GraphQL
0

Browser Automation

自动打开chrome，跳转到github.com，登录。

import os
import time

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait

# Open Chrome, go to github.com and sign in.
# SECURITY: credentials are read from the environment so that no password is
# stored in the source. Set GITHUB_USERNAME and GITHUB_PASSWORD before running;
# a missing variable raises KeyError.
username = os.environ["GITHUB_USERNAME"]
password = os.environ["GITHUB_PASSWORD"]

browser = webdriver.Chrome()
try:
    browser.set_window_size(1920, 1080)      # set the window size

    browser.get("https://github.com")

    # find_element(By.<strategy>, value) replaces the find_element_by_*
    # helpers, which were removed in Selenium 4.
    signin_link = browser.find_element(By.LINK_TEXT, "Sign in")
    signin_link.click()

    username_box = browser.find_element(By.ID, "login_field")
    username_box.send_keys(username)
    password_box = browser.find_element(By.ID, "password")
    password_box.send_keys(password)
    password_box.submit()

    # Poll for up to 10 seconds until the user name shows up in the page,
    # instead of checking once right after submitting the form.
    # Raises selenium's TimeoutException if it never appears.
    WebDriverWait(browser, 10).until(
        lambda driver: username in driver.page_source
    )

    time.sleep(0.5)  # short pause before the browser is closed

    # Alternative check against the profile link:
    # profile_link = browser.find_element(By.CLASS_NAME, "user-profile-link")
    # link_label = profile_link.get_attribute("innerHTML")
    # assert username in link_label
finally:
    # Always shut the browser down, even when a step above fails.
    browser.quit()

Working with PDFs

Demo 1

import PyPDF2

# Rotate the first page of first.pdf by 90 degrees clockwise and save that
# single page as rotated.pdf.
with open("first.pdf", "rb") as source:
    pdf_reader = PyPDF2.PdfFileReader(source)
    print(pdf_reader.numPages)

    first_page = pdf_reader.getPage(0)  # fetch the first page (index 0)
    first_page.rotateClockwise(90)      # rotate the page by 90 degrees

    # The output is written while the source file is still open.
    pdf_writer = PyPDF2.PdfFileWriter()
    pdf_writer.addPage(first_page)
    with open("rotated.pdf", "wb") as target:
        pdf_writer.write(target)

Demo 2: 合并PDF

import PyPDF2

# Merge several PDF files, in list order, into combined.pdf.
merger = PyPDF2.PdfFileMerger()
file_names = ["first.pdf", "second.pdf"]
try:
    for file_name in file_names:
        # Appending by path makes the merger open the file itself.
        merger.append(file_name)
    merger.write("combined.pdf")
finally:
    # Release the input files and buffers held by the merger, even if
    # appending or writing fails.
    merger.close()

Excel Spreadsheets

import openpyxl

# Load an existing workbook and print the names of its worksheets.
wb = openpyxl.load_workbook("transactions.xlsx")
print(wb.sheetnames)

sheet = wb["Sheet1"]

# Adding / removing worksheets (kept for reference):
# wb.create_sheet("Sheet2", 0)
# wb.remove(sheet)

# A single cell can be looked up by its coordinate string.
cell = sheet["a1"]
# Other ways to inspect or obtain a cell:
# print(cell.row)
# print(cell.column)
# print(cell.coordinate)
# cell = sheet.cell(row=1, column=1)

row_count = sheet.max_row
column_count = sheet.max_column
print(row_count)
print(column_count)

# Visit every cell of the used range; row and column numbers start at 1.
for row_idx in range(1, row_count + 1):
    for col_idx in range(1, column_count + 1):
        cell = sheet.cell(row=row_idx, column=col_idx)
        # print(cell.value)

# Indexing by column letter(s) selects whole columns.
first_column = sheet["a"]
print(first_column)
columns_a_to_c = sheet["a:c"]    # columns a through c
print(columns_a_to_c)

# Indexing by a numeric slice selects rows.
print(sheet[1:3])       # rows 1 through 3

sheet.append([1, 2, 3])  # add a row at the end of the sheet
# Also available: sheet.insert_rows, insert_cols, delete_rows, delete_cols

# Save under a new file name.
wb.save("transaction2.xlsx")

Numpy

import numpy as np

# Build a 2x3 array from nested lists and inspect it.
array = np.array([[1, 2, 3], [4, 5, 6]])
print(array)
# Show the type of the array object itself; type(np.array) would report the
# type of the factory function instead.
print(type(array))
print(array.shape)

# 3x4 integer arrays filled with zeros, ones, and a chosen constant (5).
array = np.zeros((3, 4), dtype=int)
print(array)

array = np.ones((3, 4), dtype=int)
print(array)

array = np.full((3, 4), 5, dtype=int)
print(array)

# 3x4 array of random floats drawn from [0.0, 1.0).
array = np.random.random((3, 4))
print(array)
print(array[0, 0])          # single element at row 0, column 0

# Comparing an array with a scalar yields a boolean array of the same shape.
print(array > 0.2)

# A boolean array used as an index keeps only the elements where it is True.
print(array[array > 0.2])

# Sum of all elements, then element-wise rounding down, up and to nearest.
print(np.sum(array))
print(np.floor(array))
print(np.ceil(array))
print(np.round(array))

1
2
3

# Adding two arrays of the same shape adds them element by element.
first = np.array([1, 2, 3])
second = first.copy()
print(np.add(first, second))