Sie sind auf Seite 1 von 3


#!/usr/bin/env python

from bs4 import BeautifulSoup

import requests
import shutil
import sys
import argparse

def get_arguments():
    """Parse the command line and return the resulting namespace.

    Returns:
        argparse.Namespace with two attributes:
        ``doc``    -- the scribd document to download (positional).
        ``images`` -- True when the document is made up of images
                      (``-i`` / ``--images``), False otherwise.
    """
    parser = argparse.ArgumentParser(
        description='A Scribd-Downloader that actually works')

    parser.add_argument(
        'doc',
        help='scribd document to download')
    parser.add_argument(
        '-i',
        '--images',
        help="download document made up of images",
        action='store_true',
        default=False)

    return parser.parse_args()

# fix encoding issues in python2

def fix_encoding(query):
    """Return *query* in a form that can be written out on this interpreter.

    On Python 3 the value is returned unchanged.  On Python 2 text is
    encoded to UTF-8 bytes.  A value that is already ``bytes`` is returned
    as is: on Python 2, calling ``encode`` on a non-ASCII byte string first
    decodes it as ASCII and raises ``UnicodeDecodeError``.
    """
    if sys.version_info[0] >= 3 or isinstance(query, bytes):
        return query
    return query.encode('utf-8')

def save_image(jsonp, imagename):
    """Download the image belonging to a page URL and store it on disk.

    The image URL is derived from *jsonp* by replacing ``/pages/`` with
    ``/images/`` and every occurrence of ``jsonp`` with ``jpg``.  The
    response body is streamed into the file *imagename*, which is opened
    in binary write mode (an existing file is overwritten).
    """
    replacement = jsonp.replace('/pages/', '/images/').replace('jsonp', 'jpg')
    response = requests.get(replacement, stream=True)

    try:
        # The raw stream is not content-decoded by default; without this a
        # gzip/deflate-encoded body would be written to disk still compressed.
        response.raw.decode_content = True
        with open(imagename, 'wb') as out_file:
            shutil.copyfileobj(response.raw, out_file)
    finally:
        # A streamed response keeps its connection until explicitly closed.
        response.close()

def save_text(jsonp, filename):
    """Fetch a page URL and append its text content to *filename*.

    The response text is stripped of its ``<page_no>_callback(["`` opener,
    its ``"]);`` closer, literal ``\\n`` sequences and all remaining
    backslashes, then parsed as HTML.  The text of every ``<span>`` with
    class ``a`` is appended to *filename*, one span per line.  Nothing is
    written (and the file is not created) when no such span is found.
    """
    response = requests.get(url=jsonp).text
    # Single character at index 11 of the response, used as the page number
    # when building the callback prefix to strip.
    page_no = response[11:12]

    response_head = (
        response.replace(page_no + '_callback(["', '')
        .replace('\\n', '')
        .replace('\\', '')
        .replace('"]);', '')
    )
    soup_content = BeautifulSoup(response_head, 'html.parser')

    lines = [
        fix_encoding(span.get_text()) + '\n'
        for span in soup_content.find_all('span', {'class': 'a'})
    ]

    # Open the output once for the whole page instead of once per span.
    if lines:
        with open(filename, 'a') as feed:
            feed.writelines(lines)

# detect image and text

def save_content(jsonp, images, train, title):
    """Save one page as an image or as text and return the next page counter.

    Args:
        jsonp:  page URL; an empty string means there is nothing to save.
        images: when true the page is saved as ``<title>_<train>.jpg``,
                otherwise its text is appended to ``<title>.txt``.
        train:  running page counter used in image file names.
        title:  base name for the output file(s).

    Returns:
        ``train + 1`` when a page was saved, ``train`` unchanged when
        *jsonp* is empty.
    """
    if jsonp == '':
        return train

    if images:
        imagename = title + '_' + str(train) + '.jpg'
        print('Downloading image to ' + imagename)
        save_image(jsonp, imagename)
    else:
        save_text(jsonp, (title + '.txt'))

    return train + 1

def sanitize_title(title):
    """Replace characters that may not appear in a file or directory name.

    Every space, asterisk, double quote, forward slash, backslash, angle
    bracket, colon, pipe and question mark in *title* is replaced by an
    underscore.  These are the characters Windows rejects in names, plus
    the space, which is replaced to preserve previous behavior.

    Returns:
        The sanitized title; an empty title is returned unchanged.
    """
    # The backslash is escaped explicitly: a bare "\<" is an invalid escape
    # sequence and triggers a warning on current Python versions.
    forbidden_chars = ' *"/\\<>:|?'
    replace_char = '_'

    for ch in forbidden_chars:
        title = title.replace(ch, replace_char)

    return title

# the main function

def get_scribd_document(url, images):
    """Download every page of the document at *url*.

    The page at *url* is fetched and parsed; its ``<title>`` text, passed
    through ``sanitize_title``, becomes the base name of the output.  Each
    ``<script type="text/javascript">`` element is then scanned for a
    substring running from ``https://`` to the following ``.jsonp``, and
    each one found is handed to ``save_content``.

    Args:
        url:    address of the document page.
        images: forwarded to ``save_content``; true saves pages as images,
                false extracts text.
    """
    response = requests.get(url=url).text
    soup = BeautifulSoup(response, 'html.parser')

    title = sanitize_title(soup.find('title').get_text())

    if not images:
        print('Extracting text to ' + title + '.txt\n')

    print(title + '\n')

    js_text = soup.find_all('script', type='text/javascript')

    suffix = '.jsonp'
    train = 1

    for script in js_text:
        for chunk in script:
            start = chunk.find('https://')
            # A string child gives -1 on a miss; a non-string child (a tag)
            # gives None from its own find().  Neither contains a page URL.
            if start is None or start == -1:
                continue

            # Look for the suffix only after the URL start, and skip the
            # chunk when it is absent instead of slicing with -1.
            end = chunk.find(suffix, start)
            if end == -1:
                continue

            jsonp = chunk[start:end + len(suffix)]
            train = save_content(jsonp, images, train, title)

def command_line():
    """Entry point: read the command-line arguments and run the download."""
    args = get_arguments()
    get_scribd_document(args.doc, args.images)

# Run the downloader only when executed as a script, not when imported.
if __name__ == '__main__':
    command_line()


Das könnte Ihnen auch gefallen