
more documentation and comments

Branch: master
Andreas Demmelbauer committed 5 years ago
commit 2b485e704e
2 changed files, 13 additions and 9 deletions:
  1. README.md (+3, -2)
  2. feedcake.py (+10, -7)

README.md (+3, -2)

```diff
@@ -55,9 +55,10 @@ news pages
 * add cronjob for `cron.sh`: `crontab -e`
   * `*/5 * * * * /absolute/path/to/cron.sh >> /path/to/logfile 2>&1`
 * setup your webserver:
-  * let your webserver somehow point to the `feeds` directory.
+  * let your webserver somehow point to the `public/feeds` directory.
     You should protect the http path with a basic authentication.
-  * let the `assets_url` specified in the config point to the `assets` directory.
+  * let the `assets_url` you specified in the config earlier point to the
+    `public/assets` directory.
 * After running the script the first time, your desired feed is available at
   `base_url/destination` (e.g. `https://yourdomain.tld/some-url/newspaper.xml`)
 
```



feedcake.py (+10, -7)

```diff
@@ -32,6 +32,7 @@ except:
     print(timestamp(), "ERROR: Config file not found or invalid!")
     sys.exit(1)
 
+# File location of feeds and assets are in directories next to the script
 public_path = filedir + '/public'
 assets_path = public_path + '/assets'
 feeds_path = public_path + '/feeds'
```
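This hunk is the tail of the script's config-loading block. A minimal self-contained sketch of how that block and the path setup might fit together, assuming a JSON config file named `config.json` next to the script and a simple `timestamp()` helper (both assumptions, neither is shown in this diff):

```python
import json
import os
import sys
from datetime import datetime

def timestamp():
    # stand-in for the script's own timestamp() helper (assumption)
    return datetime.now().strftime('%Y-%m-%d %H:%M:%S')

filedir = os.path.dirname(os.path.abspath(__file__))

try:
    with open(filedir + '/config.json') as f:
        config = json.load(f)
except (OSError, ValueError):
    # ValueError also covers json.JSONDecodeError for an invalid file
    print(timestamp(), "ERROR: Config file not found or invalid!")
    sys.exit(1)

# File location of feeds and assets are in directories next to the script
public_path = filedir + '/public'
assets_path = public_path + '/assets'
feeds_path = public_path + '/feeds'
```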
```diff
@@ -70,18 +71,19 @@ def download_image(url, entry_dir, filename):
         for chunk in response.iter_content(1024):
             f.write(chunk)
 
+# process the feed entries, specified in the config
 def process_feed(obj):
     feed_url = obj['source']
     output_filename = obj['destination']
 
     print(timestamp(), 'Updating:', obj['destination'])
 
-    # Get the feed
+    # Step 1: Get the feed
     r_feed = requests.get(feed_url, headers=requestheaders)
 
     # TODO: exceptions.(what if 404 or whatever?)
 
-    # Store data of new articles
+    # Step 2: Scrape and store data of new articles
     for entry in feedparser.parse(r_feed.text).entries:
         entry_dir = get_valid_filename(entry.link) # input e.g. https://orf.at/stories/3117136/
         entry_path = assets_path + '/'+ entry_dir
```
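The `get_valid_filename()` helper used here is not part of the diff. A sketch of what it and the step structure described by the new comments might look like; the helper is modeled on Django's function of the same name, and the request headers and feed URL are placeholders:

```python
import re

import feedparser
import requests

requestheaders = {'user-agent': 'Mozilla/5.0'}  # assumption: real headers defined elsewhere

def get_valid_filename(s):
    # hypothetical sketch: keep only characters that are safe in a directory name
    s = str(s).strip().replace(' ', '_')
    return re.sub(r'(?u)[^-\w.]', '', s)

def process_feed(obj):
    print('Updating:', obj['destination'])

    # Step 1: Get the feed
    r_feed = requests.get(obj['source'], headers=requestheaders)

    # Step 2: Scrape and store data of new articles (storage part omitted here)
    for entry in feedparser.parse(r_feed.text).entries:
        entry_dir = get_valid_filename(entry.link)
        print(entry.link, '->', entry_dir)

if __name__ == '__main__':
    # illustrative call; the real source/destination come from the config
    process_feed({'source': 'https://example.com/feed.xml',
                  'destination': 'newspaper.xml'})
```

With this sketch, the example input from the diff comment, `https://orf.at/stories/3117136/`, becomes the directory name `httpsorf.atstories3117136`.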
```diff
@@ -108,7 +110,7 @@ def process_feed(obj):
         content_soup.div.append(article_headline)
         article_body = online_soup.find('div', attrs={'class': 'story-content'})
         content_soup.div.append(article_body)
-
+        # Add a link to original article
         article_source = content_soup.new_tag('a', href=entry.link)
         article_source['class'] = 'source'
```
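The surrounding code assembles a fresh `content_soup` from pieces of the scraped article page. A self-contained sketch of that pattern; only the `story-content` selector appears in the diff, the headline markup is an assumption:

```python
from bs4 import BeautifulSoup

online_soup = BeautifulSoup(
    '<html><body><h1>Some headline</h1>'
    '<div class="story-content"><p>Article text …</p></div></body></html>',
    'html.parser')

content_soup = BeautifulSoup('<div></div>', 'html.parser')

article_headline = online_soup.find('h1')  # assumption: actual selector not shown
content_soup.div.append(article_headline)

article_body = online_soup.find('div', attrs={'class': 'story-content'})
# append() moves the found tag out of online_soup and into content_soup
content_soup.div.append(article_body)
```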
```diff
@@ -145,7 +147,6 @@ def process_feed(obj):
         article_source_p.append(article_source)
         content_soup.div.append(article_source_p)
 
-
         # create directory for storing and serving html and images
         os.makedirs(entry_path)
 
```
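For the "link to original article" part, a standalone sketch of how the tag is built and wrapped; the link has no visible text in the diff, so the output below reflects that:

```python
from bs4 import BeautifulSoup

content_soup = BeautifulSoup('<div></div>', 'html.parser')

# Add a link to original article
article_source = content_soup.new_tag('a', href='https://orf.at/stories/3117136/')
article_source['class'] = 'source'
article_source_p = content_soup.new_tag('p')
article_source_p.append(article_source)
content_soup.div.append(article_source_p)

print(content_soup)
# <div><p><a class="source" href="https://orf.at/stories/3117136/"></a></p></div>
```

Note that plain `os.makedirs(entry_path)` raises `FileExistsError` for a directory that already exists, so the surrounding code presumably reaches it only for entries that have not been scraped before.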

```diff
@@ -178,15 +179,16 @@ def process_feed(obj):
 
         # TODO(?): HTML5 picture tag
 
+        # Save HTML
         f = open(entry_path + '/index.html', 'w')
         f.write(str(content_soup))
         f.close()
 
+        # Wait a bit
         time.sleep(1.3)
 
-
-    # Create new feed
+    # Step 3: Create 'new' feed
 
     # Maybe buiding a new feed from scretch using a template would be nicer but ...
     # let's just modify the original one!
 
```
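The open/write/close triple works, but a with-statement is the more idiomatic equivalent because it closes the file even if `write()` raises. A self-contained sketch of the save-and-throttle step, with a hypothetical stand-in path:

```python
import os
import time
from bs4 import BeautifulSoup

content_soup = BeautifulSoup('<div>scraped article</div>', 'html.parser')
entry_path = '/tmp/feedcake-example'  # hypothetical; the real path lives under assets_path
os.makedirs(entry_path, exist_ok=True)

# Save HTML; the with-statement closes the file even on errors
with open(entry_path + '/index.html', 'w') as f:
    f.write(str(content_soup))

# Wait a bit between entries so the source site isn't hammered
time.sleep(1.3)
```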
```diff
@@ -201,6 +203,7 @@ def process_feed(obj):
             e.extract()
             print(timestamp(), 'Exclude: ', e.title.text, '->', matches)
 
+    # Add content
     for e in feed_soup.findAll('item'):
         entry_dir = get_valid_filename(e.link.text)
         f_content = open(assets_path + '/' + entry_dir + '/index.html', 'r')
```
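Step 3 edits the original feed XML in place: items matching the exclude rules are removed with `extract()`, then the remaining items get the stored article content. The diff stops right after `f_content` is opened, so the sketch below is hedged; replacing each item's `description` with the stored full text is an assumption, and the `'xml'` parser requires lxml:

```python
import re

from bs4 import BeautifulSoup

def get_valid_filename(s):
    # same hypothetical helper as sketched earlier
    return re.sub(r'(?u)[^-\w.]', '', str(s).strip().replace(' ', '_'))

feed_xml = '''<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0"><channel>
  <item>
    <title>Example story</title>
    <link>https://orf.at/stories/3117136/</link>
    <description>teaser only</description>
  </item>
</channel></rss>'''

feed_soup = BeautifulSoup(feed_xml, 'xml')  # the 'xml' parser needs lxml installed

# Add content
for e in feed_soup.findAll('item'):
    entry_dir = get_valid_filename(e.link.text)
    # the real script reads assets_path + '/' + entry_dir + '/index.html' here;
    # what it does with that content is not in this hunk
    e.description.string = 'full article HTML read from ' + entry_dir + '/index.html'

print(feed_soup)
```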

