wubloader/wubloader/wubloader.py

"""
The central management class which everything else is run from.
Its lifecycle is managed directly by main().
"""
from calendar import timegm
import logging
import time
import socket

import gevent.event
import gevent.pool

from .heartbeat import Heartbeat
from .job import Job
from .sheets import SheetsManager
from . import states
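# Rough usage sketch (illustrative, not part of this module): main() is expected to
# build the config dict and drive the lifecycle along these lines. Only the config
# keys read in __init__ below ('bustime_start', 'name', 'sheets', 'creds') are known
# here; everything else about main() is an assumption.
#
#     wubloader = Wubloader(config)   # __init__ spawns the main loop via gevent
#     ...
#     wubloader.stop()                # graceful: finish current jobs, claim no new ones
#     wubloader.stopped.wait()        # block until the loop and all remaining tasks finish
#     # or, to abort in-flight work instead:
#     # wubloader.cancel_all()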
class Wubloader(object):
    JOBS_POLL_INTERVAL = 0.5

    def __init__(self, config):
        self.config = config
        self.bustime_base = timegm(time.strptime(config['bustime_start'], '%Y-%m-%dT%H:%M:%SZ'))
        self.name = config.get('name', socket.gethostname())
        self.sheets = SheetsManager(config['sheets'], config['creds'])

        self.stopping = False
        self.stopped = gevent.event.Event()
        # self.group contains all sub-greenlets and is used to ensure they're all shut down before exiting
        self.group = gevent.pool.Group()
        # self.job is kept as a separate reference here so it's cancellable
        self.job = None
        # self.uploads is a group tracking all currently ongoing uploads.
        # note it's a subset of self.group
        self.uploads = gevent.pool.Group()

        self.heartbeat = Heartbeat(self.sheets['heartbeat'], self.name, self.group)
        gevent.spawn(self._run)

    def stop(self):
        """Tell wubloader to gracefully stop by finishing current jobs but starting no new ones."""
        self.stopping = True

    def cancel_all(self):
        """Tell wubloader to forcefully stop by cancelling current jobs."""
        if self.job:
            self.job.cancel()
        self.uploads.kill(block=False)

    def _run(self):
        # clean up in case of prior unclean shutdown
        self.cleanup_existing()
        # heartbeat will periodically update a sheet to indicate we're alive,
        # and tell us who else is alive.
        with self.heartbeat:
            while not self.stopping:
                for job in self.find_jobs():
                    # If it's already claimed (except by us), ignore it.
                    # Note this check considers a claim by a dead bot to be invalid (except for publishes).
                    if job.uploader and job.uploader != self.name:
                        continue

                    # If we're not allowed to claim it, ignore it.
                    if self.name.lower() in job.excluded:
                        continue

                    # Acceptance checks
                    try:
                        # Checking duration exercises start time and end time parsing,
                        # which raise ValueError if they're bad.
                        if job.duration <= 0:
                            raise ValueError("Duration is {} sec, which is <= 0".format(job.duration))
                    except ValueError as e:
                        # Note that as acceptance checks are fixable, we do not put the job into an error state.
                        # The job will proceed as soon as it's fixed.
                        # We only inform the user of errors if the notes field is blank, to avoid overwriting more important info.
                        if not job.row.notes:
                            job.row.update(notes="Acceptance check failed: {}".format(e))
                        continue

                    # Acceptance checks passed, remove any existing note about failed checks
                    if job.row.notes.startswith("Acceptance check failed: "):
                        job.row.update(notes="")
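                    # Illustrative example: a row whose end time is not after its start time gives
                    # duration <= 0, so "Acceptance check failed: ..." is written to an empty notes
                    # field and the row is skipped; once the row is corrected, the note is cleared
                    # above and the job can be claimed on a later scan.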
                    # Do we have all the data?
                    # TODO if we don't, check if end time is recent. if so, skip for now.
                    # if not, initiate claim-with-holes process

                    # We've claimed the job, process it.
                    self.job = job
                    self.job.process()

                    # Exit the loop to check stopping and restart our scan for eligible jobs.
                    break
                else:
                    # We reached the end of the jobs list and didn't find any jobs to do
                    gevent.sleep(self.JOBS_POLL_INTERVAL)

        # wait for any remaining tasks to finish
        self.group.join()
        # indicate that we're done
        self.stopped.set()

    def cleanup_existing(self):
        """Scan for any existing non-publish rows claimed by us, and cancel them."""
        for job in self.find_jobs():
            if job.row.uploader == self.name and job.row.state != states.rollback(job.row.state):
                logging.warning("Found existing in progress job for us, clearing")
                job.row.update(state=states.rollback(job.row.state))
                if job.job_type != 'publish':
                    job.row.update(uploader="")
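    # (Assumption from the usage above: states.rollback() maps an in-progress state back to its
    # pre-claim state, so a row whose state equals its own rollback is already settled and is
    # left untouched.)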
    def find_jobs(self):
        """Return potential jobs (based only on state), in priority order."""
        jobs = []
        for sheet_type in ('main', 'chunks'):
            for sheet in self.sheets[sheet_type]:
                for row in sheet:
                    if row.state in states.IS_ACTIONABLE:
                        jobs.append(Job(self, sheet_type == 'chunks', sheet, row))
        return sorted(jobs, key=lambda job: job.priority)
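# Note: sorted() is ascending, so jobs with the smallest priority value are attempted first,
# and rows from the 'main' and 'chunks' sheets are interleaved purely by that value.
# (How Job computes priority is defined in the job module, not here.)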