"""
-------------------------------------------
Scrape securities class action filings from the Stanford Securities Class
Action Clearinghouse (securities.stanford.edu/filings.html) and write one
row per case to sca.csv.
-------------------------------------------
Author: Shiyu Chen
__updated__="2019-07-04"
-------------------------------------------
"""
# Imports
import csv
import re

import requests
from bs4 import BeautifulSoup
# ------------------------------------------------------------------------------
# ------------------------------------------------------------------------------
# ------------------------------------------------------------------------------
def get_total_page_number():
    """Return the number of result pages, given 20 filings per listing page."""
    req = requests.get("http://securities.stanford.edu/filings.html")
    html = req.text
    soup = BeautifulSoup(html, 'html.parser')
    # The total filing count appears in parentheses in the page header.
    page = str(soup.find("div", class_="span6"))
    pattern = r"\((?P<number>\d+)\)"
    match = re.search(pattern, page)
    num = int(match.group("number"))
    # Ceiling division: num // 20 + 1 would over-count by one page
    # whenever num is an exact multiple of 20.
    page_number = (num + 19) // 20
    return page_number
def get_all_lines_in_one_page(pn):
    """Return every filing row (<tr>) on listing page pn (uses the module-level url_)."""
    url = url_ + str(pn)
    req = requests.get(url)
    html = req.text
    soup = BeautifulSoup(html, 'html.parser')
    # Each filing row looks like:
    # <tr class="table-link" page="filings" onclick="window.location='filings-case.html?id=107058'">
    all_line = soup.find_all("tr", class_="table-link", page="filings")
    return all_line
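# Usage sketch (hypothetical; the real loop lives at the bottom of this file):
#     for row in get_all_lines_in_one_page(1):
#         print(get_basic(row))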
def get_basic(one):
    """Extract the listing columns and the case-detail link from one table row."""
    # The numeric case id is embedded in the row's onclick handler.
    pattern_link = r"id=(?P<id>\d*)"
    case_id = re.search(pattern_link, str(one)).group("id")
    link_fix = "http://securities.stanford.edu/filings-case.html?id="
    link = link_fix + case_id
    # Listing columns ------------------
    info = one.find_all("td", class_="")
    name = info[0].get_text(strip=True)
    date = info[1].get_text(strip=True)
    court = info[2].get_text(strip=True)
    exchange = info[3].get_text(strip=True)
    ticker = info[4].get_text(strip=True)
    return name, date, court, exchange, ticker, link
def get_summary(link):
    """Fetch the case-detail page; return its plain-text summary and the parsed soup."""
    req = requests.get(link)
    html = req.text
    soup = BeautifulSoup(html, 'html.parser')
    # <div class="span12" style="background-color: #ffffff;">
    summary = str(soup.find("div", class_="span12", style="background-color: #ffffff;"))
    # Strip tags, then collapse runs of whitespace into single spaces.
    summary = re.sub(r"</?(.+?)>", "", summary)
    summary = re.sub(r"\s+", " ", summary)
    return summary, soup
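# Alternative sketch (an assumption, not called anywhere in this script):
# BeautifulSoup's get_text() strips tags directly and also decodes HTML
# entities that the regex approach above leaves behind. get_summary_text
# is a hypothetical helper name.
def get_summary_text(soup):
    div = soup.find("div", class_="span12", style="background-color: #ffffff;")
    if div is None:
        return ""
    return re.sub(r"\s+", " ", div.get_text(" ", strip=True))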
def get_other(soup):
    """Extract case status, most recent update, and filing date from the case page."""
    section = soup.find("section", id="summary")
    s_u = section.find("p").get_text().strip()
    # Case status ------------------------------------
    p_status = r"Case Status:(\W+)(?P<status>\w+)(\W*)On"
    status = re.search(p_status, s_u)
    if status is None:
        status = ""
    else:
        status = status.group("status")
    # Update date ------------------------------------
    p_update = r"On or around .+\)"
    update_date = re.search(p_update, s_u)
    if update_date is None:
        update_date = ""
    else:
        update_date = update_date.group()
    # Filing date ------------------------------------
    filing_date = section.find("p", class_="lead").get_text()
    filing_date = re.search(r"Filing Date: (?P<filing_date>.+)", filing_date).group("filing_date")
    return status, update_date, filing_date
def get_class_period(soup):
    """Extract the class period start and end dates from the 'fic' section."""
    section = soup.find("section", id="fic")
    text = section.find_all("div", class_="span4")
    start_date = text[4].get_text()
    end_date = text[5].get_text()
    return start_date, end_date
# ------------------------------------------------------------------------------
# ------------------------------------------------------------------------------
# ------------------------------------------------------------------------------
url_ = "http://securities.stanford.edu/filings.html?page="
pn = 1
row_id = 1  # avoid shadowing the built-in id()
total_pages = get_total_page_number()  # fetch once, not on every loop iteration
# newline="" keeps the csv module from writing blank rows on Windows.
with open("sca.csv", "w", newline="", encoding="utf-8") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(
        ["ID", "Filing Name", "Filing Date (Listing)", "District Court", "Exchange", "Ticker", "Link",
         "Case Status", "Update Date", "Filing Date (Case Page)", "Summary", "Class Period Start",
         "Class Period End"])
    while pn <= total_pages:
        all_line = get_all_lines_in_one_page(pn)
        for oneline in all_line:
            name, date, court, exchange, ticker, link = get_basic(oneline)
            summary, soup = get_summary(link)
            status, update_date, filing_date = get_other(soup)
            start_date, end_date = get_class_period(soup)
            # Write one row per filing --------
            one = [row_id, name, date, court, exchange, ticker, link, status, update_date,
                   filing_date, summary, start_date, end_date]
            writer.writerow(one)
            row_id += 1
        print(pn)
        pn += 1
print("Finish")