From 3c5f0597591cd1f34f871039da142db2320d95b4 Mon Sep 17 00:00:00 2001
From: Idaapayo <idaapayo@gmail.com>
Date: Wed, 21 Jul 2021 17:04:17 +0300
Subject: [PATCH] functions for scraping

---
 content/blog/cic_will.rst |   1 -
 scrape.py                 | 105 +++++++++++++++++++++++++++++++++-----
 2 files changed, 93 insertions(+), 13 deletions(-)

diff --git a/content/blog/cic_will.rst b/content/blog/cic_will.rst
index 42ea3b5..58918f7 100644
--- a/content/blog/cic_will.rst
+++ b/content/blog/cic_will.rst
@@ -1,4 +1,3 @@
-
 Community Currencies and DEX Multitudes
 #########################################
 
diff --git a/scrape.py b/scrape.py
index eea6236..c8f77c8 100644
--- a/scrape.py
+++ b/scrape.py
@@ -1,21 +1,102 @@
 from bs4 import BeautifulSoup
 import requests
 
-research = "https://www.grassrootseconomics.org/research"
-html_text = requests.get(research).text
-soup = BeautifulSoup(html_text, 'html.parser')
+index_page = "https://www.grassrootseconomics.org/post/claims-and-currencies"
+html_text = requests.get(index_page, timeout=30).text  # requests has no default timeout
+soup = BeautifulSoup(html_text, 'lxml')  # parsed once, then passed to the helpers below
+
+def findwriter(soup):
+    """Print one ``:author: <name>`` line per author span found in *soup*.
+
+    Author spans are the <span> elements selected by the class string below.
+    """
+    for span in soup.find_all('span', class_='iYG_V user-name _4AzY3'):
+        print(":author: " + span.text)
+
+# findwriter(soup)
+
+def findtime(soup):
+    """Print one ``:date: <text>`` line per post-date span found in *soup*.
+
+    The span text is emitted verbatim; no date parsing is attempted.
+    """
+    for stamp in soup.find_all('span', class_='post-metadata__date time-ago'):
+        print(":date: " + stamp.text)
+
+# findtime(soup)
+
+def findtags(soup):
+    """Print a ``:tags:`` line listing the text of every tag <li> in *soup*.
+
+    Tag names are comma-separated with no surrounding spaces.
+
+    Nothing is printed when the page has no matching <li> elements
+    (indexing the empty list here used to raise IndexError).
+    """
+    names = [item.text for item in soup.find_all('li', class_='_3uJTw')]
+    if not names:
+        # No tags on this post: skip the field rather than emit an empty one.
+        return
+    # join() on a single-element list returns that element unchanged, so one
+    # code path covers both the one-tag and many-tag cases.
+    print(":tags: " + ",".join(names))
+
+# findtags(soup)
+
+def findmodified(soup):
+    """Print a ``:modified:`` line for each child of the 'Updated' paragraph.
+
+    The paragraph is the <p> selected by the class string below; a notice is
+    printed when it, or a child's <span>, is missing.
+    """
+    try:
+        for child in soup.find('p', class_="_2aGvg _1AZWZ"):
+            print(":modified: " + child.span.text.replace('Updated:', ''))
+    except (TypeError, AttributeError):
+        # find() gave None (not iterable) or a child had no <span>.
+        print("no such class for modified date")
+
+# findmodified(soup)
+
+def findtitle(soup):
+    """Return ``(':title: ' + text, text)`` for the post-title span in *soup*."""
+    heading = soup.find('span', class_='blog-post-title-font blog-post-title-color')
+    bare = heading.text
+    # The prefixed form is the metadata line; the bare form is returned too.
+    return ':title: ' + bare, bare
+
+tagtitle, text = findtitle(soup)
+
+def findslug(title):
+    """Print a lowercase slug built from the first two words of *title*.
+
+    Punctuation ,':() is stripped first; a title with fewer than two words
+    yields a shorter (possibly empty) slug instead of raising IndexError.
+    """
+    words = title.translate(str.maketrans('', '', ",':()")).split()
+    print("-".join(words[:2]).lower())
+# findslug(text)
+
+def filtercontent(soup):
+    """Print the prettified markup of each <p> under div#content-wrapper."""
+    wrapper = soup.find('div', id="content-wrapper")
+    # Only the paragraphs are printed, one prettify() dump per <p>.
+    for paragraph in wrapper.find_all('p'):
+        print(paragraph.prettify())
+
+filtercontent(soup)
 # print(soup.find_all(id=True))
 # for tag in soup.find_all(True):
 #     print(tag.name)
-def head_of_articles(soup):
-    file = open("ge-theme/static/scrapped-text/reseasrch/article-head.txt",'a+')
-    for match in soup.find_all('div', class_='s_usaAWRichTextClickableSkin_richTextContainer'):
-        # print(match.p.text)
-        for words in match.find_all('em'):
-            text = words.text
-            file.write(text + "\n")
-
-head_of_articles(soup)
+# def head_of_articles(soup):
+#     file = open("ge-theme/static/scrapped-text/reseasrch/article-head.txt",'a+')
+#     for match in soup.find_all('div', class_='s_usaAWRichTextClickableSkin_richTextContainer'):
+#         # print(match.p.text)
+#         for words in match.find_all('em'):
+#             text = words.text
+#             file.write(text + "\n")
+#
+# head_of_articles(soup)
 # print(isinstance(head_of_articles(soup), list))
 
 # for match in soup.find_all('div', class_='s_usaAWRichTextClickableSkin_richTextContainer'):