Don't use the socket module directly — use more HTTP-oriented libraries instead. Also, you seem to be using Python 2; I have used only Python 3 for the last couple of years.
I paste below some pieces of code (sorry for the mess) that load things fine in Python 3, including through Cloudflare:
def getSacImage(sacimageid, sacfolder):
    """Download the JPEG for *sacimageid* into *sacfolder*.

    Fetches http://site.com/folder/<id>.jpg via getGeneric() and saves it as
    <sacfolder>/<id>_<DDMonYYYYHHMMSS>.jpg, where the date part comes from the
    server's Last-Modified header (so an unchanged image is not re-downloaded).

    Returns a (sacimageid, message) tuple; the message describes success or
    the reason the image was skipped / the error that occurred.
    """
    sacimageidstr = str(sacimageid)
    imageaddr = 'http://site.com/folder/' + sacimageidstr + '.jpg'
    opener, errormesg = getGeneric(imageaddr)
    if opener is None:  # network/HTTP failure; errormesg already includes the URL
        return sacimageid, errormesg
    # From here on the response is open: close it on EVERY exit path via finally
    # (the original closed it by hand before each return and leaked it when an
    # unexpected exception escaped, e.g. from strptime).
    try:
        imgtype = opener.headers.get("Content-Type")
        lastmod = opener.headers.get("Last-Modified")  # ex: Tue, 04 Aug 2009 17:47:52 GMT
        if imgtype != 'image/jpeg':  # only copy if it is a jpg
            return sacimageid, ' not an image (what is that!?)'
        if lastmod is None:
            # BUGFIX: the original passed None to strptime (TypeError) when the
            # server omitted Last-Modified; skip such responses instead.
            return sacimageid, ' had no Last-Modified header, skipped'
        # %Z matches a time-zone NAME like GMT; %z would be needed for a
        # +HHMM / -HHMM offset. (time.strptime() returns a different type
        # than datetime.strptime() — we want the datetime one.)
        image_datetime_obj = datetime.strptime(lastmod, '%a, %d %b %Y %H:%M:%S %Z')
        if image_datetime_obj + timedelta(days=1) < datetime.now():
            return sacimageid, ' image older than 24h, too old, skipped'
        # Build the date suffix for the file name: strip the leading weekday
        # ("Tue, ") and trailing " GMT", then drop spaces and colons,
        # e.g. "04Aug2009174752".
        filedatemod = lastmod[5:-4].replace(" ", "").replace(":", "")
        outpath = None  # defined up-front so the except handler can print it
        try:
            outpath = os.path.join(sacfolder, sacimageidstr + '_' + filedatemod + '.jpg')
        except Exception as e:
            print('Could not do OS path ', outpath, e.__class__, e)
            return sacimageid, ' was a local file path error'
        if os.path.isfile(outpath):  # same id + same date already on disk
            return sacimageid, ' already got the file downloaded before'
        try:
            local_file = open(outpath, 'wb')
        except Exception as e:
            print('Could not open ', outpath, e.__class__, e)
            return sacimageid, ' was a local file open error'
        try:
            with local_file:  # guarantees close even if write/read fails
                local_file.write(opener.read())
        except Exception as e:
            print('Could not read from web or write to local file: ', outpath, e.__class__, e)
            return sacimageid, ' was a local file write error'
        return sacimageid, ' was an image'
    finally:
        opener.close()
*********
def getGeneric(openthisurl):
    """Open *openthisurl* and return a (response, message) pair.

    On success returns (response, openthisurl) where response is the
    file-like object from urllib.request.urlopen() (an
    http.client.HTTPResponse for http/https URLs); the CALLER must close it.
    On any failure returns (None, message) where the message includes the URL
    and a short description of the error.

    See: https://docs.python.org/3/library/urllib.request.html
    (the two reference URLs below L77/L80 in the original were pasted as bare
    lines, which is a Python syntax error — they belong in comments).
    """
    # Without a timeout, the later read() can hang forever (not even ctrl+c
    # works on Windows). The timeout passed to urlopen() covers the socket
    # operations of this request, so the global socket.setdefaulttimeout()
    # workaround is not needed.
    mysockettimeout = 10  # seconds
    try:
        opener = urllib.request.urlopen(openthisurl, timeout=mysockettimeout)
    except HTTPError as e:
        # NOTE: if urlopen() raised, `opener` was never bound, so the
        # original's "close opener just in case" code in each handler was dead.
        if e.code == 404:  # not found: maybe not a model ID at all, or image deleted
            return None, openthisurl + ' = 404 not found error'
        # for example 503 service temporarily unavailable
        print('Could not urlretrieve HTTPError', openthisurl, e.__class__, e, e.code)
        return None, openthisurl + ' was an HTTPError: ' + str(e.code)
    except URLError as e:
        print('Could not urlretrieve URLError', openthisurl, e.__class__, e, e.args)
        if hasattr(e, 'reason'):
            print('We failed to reach a server. Reason: ', e.reason)
        elif hasattr(e, 'code'):
            print('The server could not fulfill the request. Error code: ', e.code)
        return None, openthisurl + ' was an url error'
    except Exception as e:
        print('Could not urlretrieve: ', openthisurl, e.__class__, e)
        return None, openthisurl + ' was an error'
    # urlopen succeeded: hand the open response (not None) back to the caller
    return opener, openthisurl
######## end of getGeneric()
*********
You may also want to multithread this so the loop runs fast (otherwise, running sequentially, it practically never finishes):
# A ProcessPoolExecutor would also work, but needs the
# `if __name__ == '__main__':` guard:
# with futures.ProcessPoolExecutor(max_workers=16) as poolexec:
with futures.ThreadPoolExecutor(max_workers=16) as poolexec:
    for one_model_id in includeids:  # fish for new accounts
        # submit(fn, *args) schedules getSacImage(one_model_id, sacfolder)
        # and returns a Future immediately. Jobs may FINISH in a different
        # order than submitted: some HTTP calls take longer, or time out.
        pending_job = poolexec.submit(getSacImage, one_model_id, sacfolder)
        # printresults(future) runs when each job completes or is cancelled,
        # taking the place of a direct printresults(xmodelid, sacresult) call.
        pending_job.add_done_callback(printresults)
print("Time End Loop: " + time.strftime("%H:%M:%S"))