|
|
@@ -96,7 +96,7 @@ def process_feed(feed_url, output_filename):
 
         online_soup = BeautifulSoup(r.text, 'html.parser')
 
-        content_soup = BeautifulSoup('<article></article>', 'html.parser')
+        content_soup = BeautifulSoup('<div></div>', 'html.parser')
 
         # Remove all Comments
         for element in online_soup(text=lambda text: isinstance(text, Comment)):
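The hunk cuts off before the loop body, but the usual bs4 idiom for stripping HTML comments is to extract() each match; Comment subclasses NavigableString, which is why the text= callable can match it, and calling the soup object directly is shorthand for find_all(). A minimal, self-contained sketch (standard bs4 API only, not code from this repo):

    from bs4 import BeautifulSoup, Comment

    soup = BeautifulSoup('<p>Text<!-- tracking marker --></p>', 'html.parser')

    # match every text node that is actually a Comment
    for element in soup(text=lambda text: isinstance(text, Comment)):
        element.extract()  # detach the comment node from the tree

    print(soup)  # <p>Text</p>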
|
|
@@ -108,41 +108,41 @@ def process_feed(feed_url, output_filename):
 
         if entry.link.startswith('https://or'+'f.a'+'t/sto'+'ries'):
             if entry.date:
                 article_time = content_soup.new_tag('time', datetime=entry.date)
-                content_soup.article.append(article_time)
+                content_soup.div.append(article_time)
             article_headline = online_soup.find('h1', attrs={'class': 'story-lead-headline'})
-            content_soup.article.append(article_headline)
+            content_soup.div.append(article_headline)
             article_body = online_soup.find('div', attrs={'class': 'story-content'})
-            content_soup.article.append(article_body)
+            content_soup.div.append(article_body)
             article_link = content_soup.new_tag('a', href=entry.link)
-            article_link['class'] = 'source';
+            article_link['class'] = 'source'
             article_link.string = 'Quelle (' + entry.link + ')'
-            content_soup.article.append(article_link)
+            content_soup.div.append(article_link)
 
         if entry.link.startswith('https://de'+'rst'+'and'+'ard'+'.a'+'t/20'): # url starts with number ... too lazy for regex :)
             if entry.published:
                 article_time = content_soup.new_tag('time', datetime=entry.published)
-                content_soup.article.append(article_time)
+                content_soup.div.append(article_time)
             article_headline = online_soup.find('h1', attrs={'itemprop': 'headline'})
-            content_soup.article.append(article_headline)
+            content_soup.div.append(article_headline)
             # images etc
             article_aside = online_soup.find('div', id="content-aside")
-            content_soup.article.append(article_aside)
+            content_soup.div.append(article_aside)
             article_body = online_soup.find('div', attrs={'itemprop': 'articleBody'})
-            content_soup.article.append(article_body)
-            article_link = content_soup.new_tag('a', href=entry.link)
-            article_link['class'] = 'source';
-            article_link.string = 'Quelle (' + entry.link.split('?')[0] + ')'
-            content_soup.article.append(article_link)
+            content_soup.div.append(article_body)
 
             # modify original link -> mobile version and comment section
             link_to_comments = re.sub(r'(\/\/)', r'\1mobil.',entry.link.split('?')[0]) + '?_viewMode=forum#'
             article_comments_link = content_soup.new_tag('a', href=link_to_comments)
-            article_comments_link['class'] = 'comments';
-            article_comments_link.sting = 'Kommentare'
-            content_soup.article.append(article_comments_link)
+            article_comments_link['class'] = 'comments'
+            article_comments_p = content_soup.new_tag('p')
+            article_comments_link.string = 'Kommentare'
+            article_comments_p.append(article_comments_link)
+            content_soup.div.append(article_comments_p)
 
-            article_link.string = 'Quelle (' + entry.link.split('?')[0] + ')'
-            content_soup.article.append(article_link)
+            article_link = content_soup.new_tag('a', href=entry.link.split('?')[0])
+            article_link['class'] = 'source'
+            article_link.string = 'Quelle: ' + entry.link.split('?')[0]
+            content_soup.div.append(article_link)
 
         # create directory for storing and serving html and images
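One thing worth knowing about the div.append() calls in this hunk: BeautifulSoup's append() re-parents a tag that already lives in another tree, so every find() result is physically moved from online_soup into content_soup. It also appears to be why the removed code could append article_link a second time after the comments section; the second append() simply relocated the existing tag to the end. A small sketch (standard bs4 behaviour, illustrative markup):

    from bs4 import BeautifulSoup

    online_soup = BeautifulSoup('<h1 class="story-lead-headline">Titel</h1>', 'html.parser')
    content_soup = BeautifulSoup('<div></div>', 'html.parser')

    headline = online_soup.find('h1', attrs={'class': 'story-lead-headline'})
    content_soup.div.append(headline)  # re-parents the tag into content_soup

    print(online_soup)   # empty - the <h1> has left the source tree
    print(content_soup)  # <div><h1 class="story-lead-headline">Titel</h1></div>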
|
|
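The link_to_comments rewrite in the hunk above splices 'mobil.' in right after the '//' of the scheme and drops the query string, pointing at the mobile forum view. A quick demonstration, using a placeholder domain and a made-up article URL:

    import re

    link = 'https://example.at/2000000000000/ein-artikel?ref=rss'
    base = link.split('?')[0]  # drop the query string
    link_to_comments = re.sub(r'(\/\/)', r'\1mobil.', base) + '?_viewMode=forum#'

    print(link_to_comments)
    # https://mobil.example.at/2000000000000/ein-artikel?_viewMode=forum#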
@@ -150,7 +150,6 @@ def process_feed(feed_url, output_filename):
 
         # download all article images and replace image source
         for img in content_soup.findAll('img'):
-            print(img)
             if img.get('data-src'):
                 old_url = img['data-src']
                 if not old_url.startswith('data:'):
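The hunk ends mid-branch, so the rest of the image handling is not shown. For orientation, a hedged sketch of the usual data-src localization pattern this loop appears to implement; the function name, the assets_dir parameter, and the file-naming scheme are illustrative assumptions, not code from this repo:

    import os
    import requests

    def localize_images(content_soup, assets_dir):
        # hypothetical sketch: download each lazy-loaded image and
        # point the tag at the local copy
        os.makedirs(assets_dir, exist_ok=True)
        for img in content_soup.findAll('img'):
            old_url = img.get('data-src') or img.get('src', '')
            if not old_url or old_url.startswith('data:'):
                continue  # keep inline base64 images as they are
            filename = old_url.split('/')[-1].split('?')[0]
            r = requests.get(old_url, timeout=10)
            with open(os.path.join(assets_dir, filename), 'wb') as f:
                f.write(r.content)
            img['src'] = filename  # relative path, served next to the html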
|
|
|