"""Clean up a Pomodoro Prompt time log and condense it into daily entries."""

import re

import numpy as np
import pandas as pd

import settings

timelog = pd.read_csv(settings.pomodoro_logfile())

# Dump bad data. The real solution here is to get rid of the damned 'Cancel'
# button on the Pomodoro Prompt dialog, but i don't know how to do that, so we
# need to drop the rows where the work task description is blank, which is
# coming in as not a number for reasons i'm not entirely clear on. Maybe
# because it's the last row of the spreadsheet? Anyway we cannot do anything
# with no data in the description, so drop them at the outset.
# Note that dropna() with no arguments drops a row when *any* column is empty.
timelog = timelog.dropna()
timelog = timelog.reset_index(drop=True)

# For debugging, keep original description around.
timelog["orig_desc"] = timelog["description"]

# Clean up description before we go to work on it.
timelog["description"] = timelog["description"].str.strip()

# Allow multiple entries to be put into one prompt by splitting with semicolon.
# TODO make this a flag since it's possible to use semicolons without meaning
# to make multiple task entries at once.
timelog["description"] = list(timelog["description"].str.split(";"))
# reset_index() without drop=True keeps the pre-explode row number in an
# "index" column, so entries split from one prompt can be traced back to it.
timelog = timelog.explode("description").reset_index()

timelog["started"] = pd.to_datetime(timelog["started"]).dt.tz_convert("US/Eastern")
timelog["recorded"] = pd.to_datetime(timelog["recorded"]).dt.tz_convert("US/Eastern")

latest_recorded = settings.pomodoro_latest_recorded()
if latest_recorded:
    # Compare as timestamps rather than splicing the setting into a query
    # string, which only parses when the value happens to be a quoted literal
    # and raises TypeError when the setting is not a str at all.
    # NOTE(review): assumes the setting is a timestamp or a (possibly quoted)
    # timestamp string; a value without a UTC offset is taken to be US/Eastern
    # wall time, matching the "recorded" column -- confirm against settings.
    cutoff = pd.Timestamp(str(latest_recorded).strip("\"' "))
    if cutoff.tzinfo is None:
        cutoff = cutoff.tz_localize("US/Eastern")
    # .copy() so the column assignments below act on an independent frame.
    timelog = timelog[timelog["recorded"] > cutoff].copy()

# Every logged row starts out counting for 30 minutes.
timelog["time"] = 30

# A pomodoro started before 3am Eastern time is considered to be a continuation
# of the day before, so we are, effectively, on West Coast time for determining
# the day we want to associate a time entry with. PomodoroPrompt saves as UTC.
timelog["date"] = timelog["started"].dt.tz_convert("US/Pacific").dt.date
timelog["day_of_week"] = pd.to_datetime(timelog["date"]).dt.day_name()

# If a project has been specified (the task is prefixed with a project name
# followed by a colon and a space), then put the project in its own column.
# Split "Project: task" on the first ": " only, so colons later in the task
# text stay in the description. `n` must be passed by keyword: pandas 2.x
# rejects it as a positional argument.
has_project = timelog["description"].str.contains(": ", regex=False)
project_and_task = timelog["description"].str.split(": ", n=1)
timelog["project"] = np.where(has_project, project_and_task.str[0], None)
timelog["description"] = np.where(
    has_project, project_and_task.str[1], timelog["description"]
)

# Mid-work clean up of description and new project.
timelog["description"] = timelog["description"].str.strip()
timelog["project"] = timelog["project"].str.strip()

# If a multiplier has been provided (an asterisk and an integer at the end of a
# task), then multiply the time by it and remove it from the description.
# The pattern is anchored to the end of the string ($) so that only a trailing
# "*N" counts; `\d+` accepts multi-digit multipliers such as "*12".
multiplier_regex = r"\*\s*(\d+)$"

# An earlier np.where-based version failed on some systems because np.where
# evaluates both branches for every row, and the chained .str calls broke when
# no row carried a multiplier. extract/replace below cope with rows (or whole
# columns) that have no match: extract yields NaN there, which becomes 1.
multiplier = pd.to_numeric(
    timelog["description"].str.extract(multiplier_regex, expand=False)
)
timelog["description"] = timelog["description"].str.replace(
    multiplier_regex, "", regex=True
)
timelog["time"] = timelog["time"] * multiplier.fillna(1).astype(int)

# Clean up description again, after it has been sliced and diced.
timelog["description"] = timelog["description"].str.strip()

# Replace irregular-but-known project names with ones timetracking tools use.
# `sys` is needed for the interactive-shell check at the bottom of the script
# and is not imported anywhere else in the file.
import sys

# Preferred project name -> the alternative spellings that get rewritten to it.
replacement_project_names = {
    "Find It Cambridge": ["Find It", "FIC", "Cambridge"],
    "The Propaganda Site": ["TPS", "Propaganda Site"],
    "MASS Design Group": ["MASS"],
    "Teachers with GUTS": ["TWIG", "GUTS"],
    "NICHQ Support": ["NICHQ", "NICHQ support", "nichq"],
    "Network engagement": [
        "Network Engagement",
        "network engagement",
        "Network engagment",
        "Social media",
        "Network building",
        "Agaric network engagement",
    ],
    "Agaric internal": ["Agaric", "Internal"],
    "Agaric contrib": ["Contributing", "Contrib"],
    "Leads": ["Lead", "Agaric leads", "Lead followups"],
    "Learning": ["Personal learning"],
    "Drutopia": ["Drutopia improvements"],
    "Personal / external": ["Personal/external", "Personal", "External"],
    "Near North camp": [
        "Near North Camp",
        "Near North defense",
        "Encampment support",
        "Camp support",
    ],
}

# Matching is exact and case-sensitive, which is why case variants are listed.
# TODO Probably put all alternatives in lower case and do str.lower() on
# project just before the "is in" check.
for preferred, alternatives in replacement_project_names.items():
    timelog.loc[timelog.project.isin(alternatives), "project"] = preferred

# Condense duplicate entries by date, summing the minutes spent, and listing
# the first started and last recorded times for each task.
# The fillna is essential or we drop entries with blank ('None') projects.
tl = (
    timelog.groupby(["date", timelog.project.fillna(""), "description"])
    .agg({"time": "sum", "started": "min", "recorded": "max"})
    .reset_index()
)

# sys.ps1 only exists in an interactive interpreter, so its absence means we
# are running as a script and should produce output.
if not hasattr(sys, "ps1"):
    tl.to_csv("harvest-ready.csv", index=False)
    # With no rows there is no latest timestamp (max() would be NaT), so leave
    # the stored setting untouched rather than overwrite it.
    if not tl.empty:
        settings.pomodoro_latest_recorded(tl.recorded.max())
else:
    print("We do not write to the harvest-ready.csv nor update the latest recorded setting when run interactively in the python shell.")