
more documentation and comments

master
Andreas Demmelbauer, 5 years ago
commit 2b485e704e
2 changed files with 13 additions and 9 deletions
  1. README.md  +3 -2
  2. feedcake.py  +10 -7

README.md  +3 -2

@@ -55,9 +55,10 @@ news pages
 * add cronjob for `cron.sh`: `crontab -e`
 * `*/5 * * * * /absolute/path/to/cron.sh >> /path/to/logfile 2>&1`
 * setup your webserver:
-  * let your webserver somehow point to the `feeds` directory.
+  * let your webserver somehow point to the `public/feeds` directory.
     You should protect the http path with a basic authentication.
-  * let the `assets_url` specified in the config point to the `assets` directory.
+  * let the `assets_url` you specified in the config earlier point to the
+    `public/assets` directory.
 * After running the script the first time, your desired feed is available at
   `base_url/destination` (e.g. `https://yourdomain.tld/some-url/newspaper.xml`)
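For orientation, here is a rough sketch of how the configured URLs relate to the `public/` directories the README now points to. The diff does not show the config file itself, so only the names also used by the README and by `process_feed()` (`base_url`, `assets_url`, `source`, `destination`) are taken from the source; the concrete values and layout below are illustrative.

```python
# Illustration only: how config values map onto the directories the webserver
# should serve. Key names follow the README and process_feed(); the concrete
# URLs and file names are hypothetical.
import os

filedir = os.path.dirname(os.path.abspath(__file__))
public_path = filedir + '/public'                    # directories live next to the script

config_entry = {
    'source': 'https://example.com/news/rss',        # hypothetical upstream feed
    'destination': 'newspaper.xml',                  # filename written under public/feeds
}
base_url = 'https://yourdomain.tld/some-url'         # webserver maps this to public/feeds
assets_url = 'https://yourdomain.tld/some-assets'    # webserver maps this to public/assets

print('feed written to :', public_path + '/feeds/' + config_entry['destination'])
print('feed served at  :', base_url + '/' + config_entry['destination'])
print('assets served at:', assets_url + '/<entry-dir>/...')
```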




feedcake.py  +10 -7

@@ -32,6 +32,7 @@ except:
     print(timestamp(), "ERROR: Config file not found or invalid!")
     sys.exit(1)
 
+# File location of feeds and assets are in directories next to the script
 public_path = filedir + '/public'
 assets_path = public_path + '/assets'
 feeds_path = public_path + '/feeds'
@@ -70,18 +71,19 @@ def download_image(url, entry_dir, filename):
         for chunk in response.iter_content(1024):
             f.write(chunk)
 
+# process the feed entries, specified in the config
 def process_feed(obj):
     feed_url = obj['source']
     output_filename = obj['destination']
 
     print(timestamp(), 'Updating:', obj['destination'])
 
-    # Get the feed
+    # Step 1: Get the feed
     r_feed = requests.get(feed_url, headers=requestheaders)
     # TODO: exceptions.(what if 404 or whatever?)
 
-    # Store data of new articles
+    # Step 2: Scrape and store data of new articles
 
     for entry in feedparser.parse(r_feed.text).entries:
         entry_dir = get_valid_filename(entry.link) # input e.g. https://orf.at/stories/3117136/
         entry_path = assets_path + '/'+ entry_dir
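The loop added above relies on `requests`, `feedparser` and the repo's `get_valid_filename()` helper defined elsewhere in `feedcake.py`. A minimal, self-contained sketch of these two steps, with a simplified stand-in for that helper and a hypothetical feed URL:

```python
# Sketch of Step 1 and Step 2: fetch the feed, parse it, derive a per-entry
# directory name. get_valid_filename() here is a simplified stand-in, not the
# helper from feedcake.py; the request headers are likewise only an example.
import re
import requests
import feedparser

requestheaders = {'User-Agent': 'Mozilla/5.0'}       # placeholder headers

def get_valid_filename(s):
    # crude slug: keep alphanumerics, dots and dashes, replace everything else
    return re.sub(r'[^A-Za-z0-9.-]', '-', s.strip())

feed_url = 'https://example.com/news/rss'            # hypothetical source feed
r_feed = requests.get(feed_url, headers=requestheaders)

for entry in feedparser.parse(r_feed.text).entries:
    entry_dir = get_valid_filename(entry.link)       # turns the link into a filesystem-safe name
    print(entry.link, '->', entry_dir)
```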
@@ -108,7 +110,7 @@ def process_feed(obj):
             content_soup.div.append(article_headline)
             article_body = online_soup.find('div', attrs={'class': 'story-content'})
             content_soup.div.append(article_body)
             # Add a link to original article
             article_source = content_soup.new_tag('a', href=entry.link)
             article_source['class'] = 'source'
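The hunk above appends the scraped headline and body to `content_soup` and then adds a link back to the original article via `new_tag()`. A reduced sketch of that last part, with a bare `<div>` wrapper and assumed link text:

```python
# Reduced sketch: append an <a class="source"> link to the article container.
# The wrapper markup and the link text are assumptions; feedcake.py builds a
# fuller document around the scraped headline and body.
from bs4 import BeautifulSoup

content_soup = BeautifulSoup('<div></div>', 'html.parser')
entry_link = 'https://orf.at/stories/3117136/'   # example URL from the code comment above

article_source = content_soup.new_tag('a', href=entry_link)
article_source['class'] = 'source'
article_source.string = entry_link               # link text; the original may differ

content_soup.div.append(article_source)
print(content_soup)
# prints something like: <div><a class="source" href="https://orf.at/stories/3117136/">...</a></div>
```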
@@ -145,7 +147,6 @@ def process_feed(obj):
             article_source_p.append(article_source)
             content_soup.div.append(article_source_p)
 
-
             # create directory for storing and serving html and images
             os.makedirs(entry_path)
 
@@ -178,15 +179,16 @@ def process_feed(obj):


             # TODO(?): HTML5 picture tag
 
+            # Save HTML
             f = open(entry_path + '/index.html', 'w')
             f.write(str(content_soup))
             f.close()
 
+            # Wait a bit
             time.sleep(1.3)
 
 
-    # Create new feed
+    # Step 3: Create 'new' feed
 
     # Maybe buiding a new feed from scretch using a template would be nicer but ...
     # let's just modify the original one!
@@ -201,6 +203,7 @@ def process_feed(obj):
             e.extract()
             print(timestamp(), 'Exclude: ', e.title.text, '->', matches)
 
+    # Add content
     for e in feed_soup.findAll('item'):
         entry_dir = get_valid_filename(e.link.text)
         f_content = open(assets_path + '/' + entry_dir + '/index.html', 'r')
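Step 3 (and the `# Add content` loop above) modifies the original feed rather than building a new one: each `<item>` gets the article HTML saved earlier under `assets`. A rough, self-contained sketch of that idea, with hypothetical file names, a simplified `get_valid_filename()`, and the assumption that the stored HTML is swapped into `<description>`; how feedcake.py actually injects the content may differ.

```python
# Rough sketch of Step 3: parse the downloaded feed XML, then replace each
# item's description with the article HTML stored earlier. File names and the
# use of <description> are assumptions for illustration.
import re
from bs4 import BeautifulSoup

def get_valid_filename(s):
    # simplified stand-in for the helper in feedcake.py
    return re.sub(r'[^A-Za-z0-9.-]', '-', s.strip())

with open('original-feed.xml', 'r') as f:                  # hypothetical input file
    feed_soup = BeautifulSoup(f.read(), 'xml')

for e in feed_soup.findAll('item'):
    entry_dir = get_valid_filename(e.link.text)
    with open('public/assets/' + entry_dir + '/index.html', 'r') as f_content:
        e.description.string = f_content.read()            # swap in the stored article HTML

with open('public/feeds/newspaper.xml', 'w') as f:         # hypothetical destination
    f.write(str(feed_soup))
```

Modifying the source feed in place keeps all the metadata (titles, dates, GUIDs) that a hand-built template would have to reproduce, which is the trade-off the comment in the hunk above alludes to.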

