@@ -96,7 +96,7 @@ def process_feed(feed_url, output_filename):
         online_soup = BeautifulSoup(r.text, 'html.parser')
-        content_soup = BeautifulSoup('<article></article>', 'html.parser')
+        content_soup = BeautifulSoup('<div></div>', 'html.parser')
         # Remove all Comments
         for element in online_soup(text=lambda text: isinstance(text, Comment)):
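Note on the comment-stripping idiom kept in this hunk: BeautifulSoup exposes HTML comments as `Comment` nodes, and calling the soup with a `text=` filter selects exactly those nodes (newer bs4 releases prefer the `string=` keyword, but `text=` still works). The loop body falls outside the hunk; presumably it calls `element.extract()`. A minimal self-contained sketch, with an invented snippet of HTML:

    from bs4 import BeautifulSoup, Comment

    html = '<div><!-- ad slot --><p>Text</p></div>'  # hypothetical input
    soup = BeautifulSoup(html, 'html.parser')
    # Select every Comment node and detach it from the tree.
    for element in soup(text=lambda text: isinstance(text, Comment)):
        element.extract()
    print(soup)  # -> <div><p>Text</p></div>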
@@ -108,41 +108,41 @@ def process_feed(feed_url, output_filename):
         if entry.link.startswith('https://or'+'f.a'+'t/sto'+'ries'):
             if entry.date:
                 article_time = content_soup.new_tag('time', datetime=entry.date)
-                content_soup.article.append(article_time)
+                content_soup.div.append(article_time)
             article_headline = online_soup.find('h1', attrs={'class': 'story-lead-headline'})
-            content_soup.article.append(article_headline)
+            content_soup.div.append(article_headline)
             article_body = online_soup.find('div', attrs={'class': 'story-content'})
-            content_soup.article.append(article_body)
+            content_soup.div.append(article_body)
             article_link = content_soup.new_tag('a', href=entry.link)
-            article_link['class'] = 'source';
+            article_link['class'] = 'source'
             article_link.string = 'Quelle (' + entry.link + ')'
-            content_soup.article.append(article_link)
+            content_soup.div.append(article_link)
         if entry.link.startswith('https://de'+'rst'+'and'+'ard'+'.a'+'t/20'): # url starts with number ... too lazy for regex :)
             if entry.published:
                 article_time = content_soup.new_tag('time', datetime=entry.published)
-                content_soup.article.append(article_time)
+                content_soup.div.append(article_time)
             article_headline = online_soup.find('h1', attrs={'itemprop': 'headline'})
-            content_soup.article.append(article_headline)
+            content_soup.div.append(article_headline)
             # images etc
             article_aside = online_soup.find('div', id="content-aside")
-            content_soup.article.append(article_aside)
+            content_soup.div.append(article_aside)
             article_body = online_soup.find('div', attrs={'itemprop': 'articleBody'})
-            content_soup.article.append(article_body)
-            article_link = content_soup.new_tag('a', href=entry.link)
-            article_link['class'] = 'source';
-            article_link.string = 'Quelle (' + entry.link.split('?')[0] + ')'
-            content_soup.article.append(article_link)
+            content_soup.div.append(article_body)
             # modify original link -> mobile version and comment section
             link_to_comments = re.sub(r'(\/\/)', r'\1mobil.', entry.link.split('?')[0]) + '?_viewMode=forum#'
             article_comments_link = content_soup.new_tag('a', href=link_to_comments)
-            article_comments_link['class'] = 'comments';
-            article_comments_link.sting = 'Kommentare'
-            content_soup.article.append(article_comments_link)
-            article_link.string = 'Quelle (' + entry.link.split('?')[0] + ')'
-            content_soup.article.append(article_link)
+            article_comments_link['class'] = 'comments'
+            article_comments_p = content_soup.new_tag('p')
+            article_comments_link.string = 'Kommentare'
+            article_comments_p.append(article_comments_link)
+            content_soup.div.append(article_comments_p)
+            article_link = content_soup.new_tag('a', href=entry.link.split('?')[0])
+            article_link['class'] = 'source'
+            article_link.string = 'Quelle: ' + entry.link.split('?')[0]
+            content_soup.div.append(article_link)
         # create directory for storing and serving html and images
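Two things in this hunk beyond the `article` → `div` rename: the stray semicolons are gone, and the old `article_comments_link.sting = 'Kommentare'` was a typo that silently created an unused Python attribute instead of setting the link text, so the comments link rendered empty; the new `.string` assignment is what actually sets the tag's text. A minimal sketch of the `new_tag`/`append` pattern used for the comments link, with a made-up URL standing in for `link_to_comments`:

    from bs4 import BeautifulSoup

    content_soup = BeautifulSoup('<div></div>', 'html.parser')
    link_to_comments = 'https://example.org/story/123?_viewMode=forum#'  # hypothetical URL
    comments_link = content_soup.new_tag('a', href=link_to_comments)
    comments_link['class'] = 'comments'
    comments_link.string = 'Kommentare'     # .string sets the visible text; .sting would be silently ignored
    comments_p = content_soup.new_tag('p')  # block wrapper so the link sits on its own line
    comments_p.append(comments_link)
    content_soup.div.append(comments_p)
    print(content_soup)
    # -> <div><p><a class="comments" href="https://example.org/story/123?_viewMode=forum#">Kommentare</a></p></div>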
@@ -150,7 +150,6 @@ def process_feed(feed_url, output_filename):
         # download all article images and replace image source
         for img in content_soup.findAll('img'):
-            print(img)
             if img.get('data-src'):
                 old_url = img['data-src']
                 if not old_url.startswith('data:'):
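The deleted `print(img)` was leftover debugging output. For context, this loop rewrites lazily loaded images, whose real URL lives in `data-src` rather than `src`, while skipping inline `data:` URIs. The rest of the loop body falls outside the hunk; a sketch of the overall shape, with `download_image` as a hypothetical helper standing in for whatever fetching and saving the script actually does:

    for img in content_soup.findAll('img'):
        old_url = img.get('data-src') or img.get('src')
        if not old_url or old_url.startswith('data:'):
            continue  # inline data: URIs need no download
        # download_image is hypothetical: fetch the file, store it in the
        # serving directory, and return the local relative path.
        img['src'] = download_image(old_url)
        if img.get('data-src'):
            del img['data-src']  # drop the lazy-load attribute so viewers use src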