View Single Post
Old 07-02-2016, 03:03 AM  
adultmobile
No, I am not banned
 
adultmobile's Avatar
 
Industry Role:
Join Date: Nov 2003
Location: ChatGF.com
Posts: 5,345
Don't use raw sockets; use a more HTTP-oriented library instead. Also, you seem to be using Python 2; I have been using only Python 3 for a couple of years.

Below I paste some pieces of code (sorry for the mess) that load content fine in Python 3, including from sites behind Cloudflare:

def getSacImage(sacimageid, sacfolder):
    """Download one remote JPEG into ``sacfolder`` unless it is stale or already saved.

    The image URL is built from a hard-coded base address plus ``sacimageid``.
    The file is stored as ``<id>_<Last-Modified stamp>.jpg`` so that an
    unchanged remote image is recognised on later runs and not fetched again.

    Args:
        sacimageid: Identifier of the image; converted with ``str()`` to build
            the URL and the local file name.
        sacfolder: Directory in which the image file is written.

    Returns:
        A ``(sacimageid, message)`` tuple. ``message`` is a short status text
        (or the error text produced by ``getGeneric`` when the URL could not
        be opened).
    """
    # Function-scope import: only the age check below needs it.
    from datetime import timezone

    sacimageidstr = str(sacimageid)
    imageaddr = 'http://site.com/folder/' + sacimageidstr + '.jpg'

    opener, errormesg = getGeneric(imageaddr)
    if opener is None:  # open failed; errormesg already includes the URL
        return sacimageid, errormesg

    # From here on the response is open; the single ``finally`` below closes
    # it on every exit path, including unexpected exceptions.
    try:
        imgtype = opener.headers.get("Content-Type")
        if imgtype != 'image/jpeg':
            return sacimageid, ' not an image (what is that!?)'

        # Last-Modified tells a new image from an old one,
        # e.g. "Tue, 04 Aug 2009 17:47:52 GMT".
        lastmod = opener.headers.get("Last-Modified")
        try:
            # %Z matches a zone *name* such as GMT and yields a naive datetime.
            image_datetime_obj = datetime.strptime(lastmod, '%a, %d %b %Y %H:%M:%S %Z')
        except (TypeError, ValueError) as e:
            # TypeError: header missing (None); ValueError: unexpected format.
            print('Could not parse Last-Modified ', lastmod, e.__class__, e)
            return sacimageid, ' had no usable Last-Modified header'

        # The header is expressed in GMT, so compare against the current UTC
        # time (made naive to match the parsed value), not local time.
        utc_now = datetime.now(timezone.utc).replace(tzinfo=None)
        if image_datetime_obj + timedelta(days=1) < utc_now:
            return sacimageid, ' image older than 24h, too old, skipped'

        # File-name suffix: drop the leading "Tue, " and the trailing " GMT",
        # then remove spaces and colons.
        filedatemod = lastmod[5:-4].replace(" ", "").replace(":", "")

        outpath = None  # keeps the error print below valid if join() fails
        try:
            outpath = os.path.join(sacfolder, sacimageidstr + '_' + filedatemod + '.jpg')
        except Exception as e:
            print('Could not do OS path ', outpath, e.__class__, e)
            return sacimageid, ' was a local file path error'

        # Same id and same modification stamp: nothing new to download.
        if os.path.isfile(outpath):
            return sacimageid, ' already got the file downloaded before'

        try:
            local_file = open(outpath, 'wb')
        except Exception as e:
            print('Could not open ', outpath, e.__class__, e)
            return sacimageid, ' was a local file open error'

        try:
            with local_file:
                local_file.write(opener.read())
        except Exception as e:
            print('Could not read from web or write to local file: ', outpath, e.__class__, e)
            # A partial file left behind would make the next run report
            # "already downloaded", so remove it (best effort).
            try:
                os.remove(outpath)
            except OSError:
                pass
            return sacimageid, ' was a local file write error'

        return sacimageid, ' was an image'
    finally:
        opener.close()


*********

def getGeneric(openthisurl, timeout=10):
    """Open a URL with ``urllib.request.urlopen`` and report failures as text.

    This function does not raise for ordinary failures; every exception from
    ``urlopen`` is converted into a message so callers can simply test the
    first element of the result for ``None``.

    Args:
        openthisurl: The URL to open.
        timeout: Seconds passed to ``urlopen`` as its ``timeout`` argument.
            When ``urlopen`` is given no timeout it falls back to the global
            socket default, which normally means blocking without limit.

    Returns:
        ``(response, openthisurl)`` on success, where ``response`` is the open
        object returned by ``urlopen`` (the caller must close it), or
        ``(None, message)`` on failure, where ``message`` starts with the URL.
    """
    try:
        opener = urllib.request.urlopen(openthisurl, timeout=timeout)
    except HTTPError as e:
        # HTTPError doubles as the response object for the failed request, so
        # it must be closed to release the underlying connection.
        try:
            if e.code == 404:
                return None, openthisurl + ' = 404 not found error'
            # Any other status, for example 503 service temporarily unavailable.
            print('Could not urlretrieve HTTPError', openthisurl, e.__class__, e, e.code)
            return None, openthisurl + ' was an HTTPError: ' + str(e.code)
        finally:
            e.close()
    except URLError as e:
        # HTTPError (a URLError subclass) was handled above, so this branch is
        # reached only when no HTTP response was obtained.
        print('Could not urlretrieve URLError', openthisurl, e.__class__, e, e.args)
        print('We failed to reach a server. Reason: ', e.reason)
        return None, openthisurl + ' was an url error'
    except Exception as e:
        # Deliberate catch-all so one bad URL cannot abort the caller's loop
        # (for example ValueError for a URL without a scheme).
        print('Could not urlretrieve: ', openthisurl, e.__class__, e)
        return None, openthisurl + ' was an error'

    return opener, openthisurl

######## end of getGeneric()

*********

You may also want to multithread it so the loop runs fast (run sequentially, it practically never finishes):


# Fan the downloads out over a pool of workers: one getSacImage job per ID in includeids.
# NOTE(review): indentation was lost in this paste; the for loop presumably belongs
# inside the with block - restore the nesting before running.
# Process-based alternative; it needs an `if __name__ == '__main__'` guard:
# with futures.ProcessPoolExecutor(max_workers=16) as executorxx:
with futures.ThreadPoolExecutor(max_workers=16) as executorxx:


for xmodelid in includeids: # schedule one job per ID

# Jobs may complete in a different order than they were submitted: an HTTP call can
# take longer, shorter, or time out.
# submit(fn, *args, **kwargs) schedules fn(*args, **kwargs) and returns a Future object.
futurejobz = executorxx.submit(getSacImage, xmodelid, sacfolder) # runs getSacImage(xmodelid, sacfolder) in the pool

# add_done_callback(fn): fn is called with the future when it is cancelled or finishes running.
futurejobz.add_done_callback(printresults) # printresults is invoked once this job is done

# NOTE(review): it is unclear from the paste whether this print sat inside or after the
# with block - confirm; that decides whether it fires after submission or after all jobs end.
print("Time End Loop: " + time.strftime("%H:%M:%S")) # prints the current time as HH:MM:SS
__________________

TubeCamGirl.com
adultmobile is offline   Share thread on Digg Share thread on Twitter Share thread on Reddit Share thread on Facebook Reply With Quote