Functions for scraping
This commit is contained in:
parent
7a9e8339d2
commit
3c5f059759
@ -1,4 +1,3 @@
|
||||
|
||||
Community Currencies and DEX Multitudes
|
||||
#########################################
|
||||
|
||||
|
105
scrape.py
105
scrape.py
@ -1,21 +1,102 @@
|
||||
# Third-party dependencies: BeautifulSoup (bs4) for HTML parsing, requests for HTTP.
from bs4 import BeautifulSoup
import requests

# Research listing page on the Grassroots Economics site.
research = "https://www.grassrootseconomics.org/research"
html_text = requests.get(research).text
soup = BeautifulSoup(html_text, 'html.parser')

# NOTE(review): `html_text` and `soup` above are immediately overwritten by the
# two assignments below, so the fetch of the research listing is dead work (it
# still issues a live HTTP GET) — confirm whether it is needed before removing.
index_page = "https://www.grassrootseconomics.org/post/claims-and-currencies"
html_text = requests.get(index_page).text
# The rest of the script scrapes metadata/content from this single post's soup.
soup = BeautifulSoup(html_text, 'lxml')
|
||||
|
||||
def findwriter(soup):
    """Print one ':author: <name>' metadata line per author byline found."""
    for author in soup.find_all('span', class_='iYG_V user-name _4AzY3'):
        print(":author: " + author.text)
|
||||
|
||||
# findwriter(soup)
|
||||
|
||||
def findtime(soup):
    """Print one ':date: <timestamp>' metadata line per publication-date span."""
    for stamp in soup.find_all('span', class_='post-metadata__date time-ago'):
        print(":date: " + stamp.text)
|
||||
|
||||
# findtime(soup)
|
||||
|
||||
def findtags(soup):
    """Print the post's tags as a single ':tags: a,b,c' metadata line.

    Prints nothing when the page has no tag elements. (The original
    indexed ``apptags[0]`` whenever there were fewer than two tags, so an
    empty tag list raised IndexError.)
    """
    apptags = [item.text for item in soup.find_all('li', class_='_3uJTw')]
    # ",".join already handles the single-tag case, so no branching on length.
    if apptags:
        print(":tags: " + ",".join(apptags))
|
||||
|
||||
# findtags(soup)
|
||||
|
||||
def findmodified(soup):
    """Print the post's last-updated date as a ':modified: <date>' line.

    Prints a diagnostic message when the updated-date paragraph is absent
    or a child element lacks the expected <span> structure. (Narrowed from
    the original bare ``except:``, which silently swallowed every error.)
    """
    updated = soup.find('p', class_="_2aGvg _1AZWZ")
    if updated is None:
        # find() returns None when the class is missing from the page.
        print("no such class for modified date")
        return
    try:
        for update in updated:
            # Each child is expected to wrap a <span> holding "Updated: <date>".
            modified = update.span.text.replace('Updated:', '')
            print(":modified: " + modified)
    except AttributeError:
        # A child without the .span/.text chain (e.g. a bare text node).
        print("no such class for modified date")
|
||||
|
||||
# findmodified(soup)
|
||||
|
||||
def findtitle(soup):
    """Return a pair: the ':title: '-prefixed title line and the raw title text."""
    raw = soup.find('span', class_='blog-post-title-font blog-post-title-color').text
    return ':title: ' + raw, raw
|
||||
|
||||
# Module-level: capture both the ':title:'-prefixed line and the raw title text.
tagtitle, text = findtitle(soup)
|
||||
|
||||
def findslug(title):
    """Derive, print, and return a URL slug from *title*.

    The slug is the first two words of the title, lowercased and joined
    with "-", after stripping the punctuation characters , ' : ( ).
    Titles with fewer than two words no longer raise IndexError (original
    bug); they yield a one-word or empty slug. The slug is also returned
    (original only printed), which is backward-compatible.
    """
    # One C-level translate pass instead of five chained .replace() calls.
    words = title.translate(str.maketrans('', '', ",':()")).split()
    slug = "-".join(words[:2]).lower()
    print(slug)
    return slug
|
||||
# findslug(text)
|
||||
|
||||
def filtercontent(soup):
    """Pretty-print every <p> element inside the post's main content wrapper."""
    wrapper = soup.find('div', id="content-wrapper")
    for paragraph in wrapper.find_all('p'):
        print(paragraph.prettify())
|
||||
|
||||
filtercontent(soup)  # dump the post body paragraphs to stdout
|
||||
# print(soup.find_all(id=True))
|
||||
# for tag in soup.find_all(True):
|
||||
# print(tag.name)
|
||||
def head_of_articles(soup):
    """Append the italicized (<em>) lead-in text of every article container
    on the page to the article-head archive file, one line per <em> run.
    """
    # NOTE(review): the path spellings "scrapped"/"reseasrch" look like typos,
    # but are kept byte-for-byte so existing data keeps landing in one place.
    # `with` guarantees the handle is closed (the original leaked it).
    with open("ge-theme/static/scrapped-text/reseasrch/article-head.txt", 'a+') as file:
        for match in soup.find_all('div', class_='s_usaAWRichTextClickableSkin_richTextContainer'):
            for words in match.find_all('em'):
                file.write(words.text + "\n")
|
||||
|
||||
head_of_articles(soup)  # append each article's <em> lead-in text to the archive file
|
||||
# def head_of_articles(soup):
|
||||
# file = open("ge-theme/static/scrapped-text/reseasrch/article-head.txt",'a+')
|
||||
# for match in soup.find_all('div', class_='s_usaAWRichTextClickableSkin_richTextContainer'):
|
||||
# # print(match.p.text)
|
||||
# for words in match.find_all('em'):
|
||||
# text = words.text
|
||||
# file.write(text + "\n")
|
||||
#
|
||||
# head_of_articles(soup)
|
||||
# print(isinstance(head_of_articles(soup), list))
|
||||
|
||||
# for match in soup.find_all('div', class_='s_usaAWRichTextClickableSkin_richTextContainer'):
|
||||
|
Loading…
Reference in New Issue
Block a user