video: tegra: host: add idle time estimate in 3dfs
Ilan Aelion [Tue, 28 Aug 2012 17:05:36 +0000 (11:05 -0600)]
When a throughput hint is available, still keep track of the idle time
percentage and use both in gpu scaling. Also modifying fallback
scaling code to use same idle estimate used when a throughput hint is
available. Dropping different time frames for scaling up and down.

Bug 1034948
Bug 965517

Change-Id: Ib3945642768e36a6c0c50f4195c89e3bb67f8442
Signed-off-by: Ilan Aelion <iaelion@nvidia.com>
Reviewed-on: http://git-master/r/129410
(cherry picked from commit 1e9974097286a78f34367683a3921c9b5bf77d4d)
Reviewed-on: http://git-master/r/131155
Reviewed-by: Varun Colbert <vcolbert@nvidia.com>
Tested-by: Varun Colbert <vcolbert@nvidia.com>

drivers/misc/tegra-throughput.c
drivers/video/tegra/host/gr3d/scale3d.c

index 366b0ec..d491ef8 100644 (file)
@@ -46,35 +46,34 @@ static void set_throughput_hint(struct work_struct *work)
 
 static int throughput_flip_callback(void)
 {
+       long timediff;
+       ktime_t now;
+
        /* only register flips when a single app is active */
        if (multiple_app_disable)
                return NOTIFY_DONE;
-       else {
-               long timediff;
-               ktime_t now;
-
-               now = ktime_get();
-               if (last_flip.tv64 != 0) {
-                       timediff = (long) ktime_us_delta(now, last_flip);
-                       if (timediff > (long) USHRT_MAX)
-                               last_frame_time = USHRT_MAX;
-                       else
-                               last_frame_time = (unsigned short) timediff;
-
-                       if (last_frame_time == 0) {
-                               pr_warn("%s: flips %lld nsec apart\n",
-                                       __func__, now.tv64 - last_flip.tv64);
-                               return NOTIFY_DONE;
-                       }
-
-                       throughput_hint =
-                               ((int) target_frame_time * 100)/last_frame_time;
-
-                       if (!work_pending(&work))
-                               schedule_work(&work);
+
+       now = ktime_get();
+       if (last_flip.tv64 != 0) {
+               timediff = (long) ktime_us_delta(now, last_flip);
+               if (timediff > (long) USHRT_MAX)
+                       last_frame_time = USHRT_MAX;
+               else
+                       last_frame_time = (unsigned short) timediff;
+
+               if (last_frame_time == 0) {
+                       pr_warn("%s: flips %lld nsec apart\n",
+                               __func__, now.tv64 - last_flip.tv64);
+                       return NOTIFY_DONE;
                }
-               last_flip = now;
+
+               throughput_hint =
+                       ((int) target_frame_time * 1000) / last_frame_time;
+
+               if (!work_pending(&work))
+                       schedule_work(&work);
        }
+       last_flip = now;
 
        return NOTIFY_OK;
 }
index fc30c22..4914797 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * drivers/video/tegra/host/t20/scale3d.c
+ * drivers/video/tegra/host/gr3d/scale3d.c
  *
  * Tegra Graphics Host 3D clock scaling
  *
  *
  * module3d_notify_busy() is called upon submit, module3d_notify_idle() is
  * called when all outstanding submits are completed. Idle times are measured
- * over a fixed time period (scale3d.p_period). If the 3d module idle time
- * percentage goes over the limit (set in scale3d.p_idle_max), 3d clocks are
- * scaled down. If the percentage goes under the minimum limit (set in
- * scale3d.p_idle_min), 3d clocks are scaled up. An additional test is made
- * over the time frame given in scale3d.p_fast_response for clocking up
- * quickly in response to load peaks.
+ * over a fixed time period (scale3d.p_estimation_window). If the 3d module
+ * idle time percentage goes over the limit (set in scale3d.p_idle_max), 3d
+ * clocks are scaled down. If the percentage goes under the minimum limit (set
+ * in scale3d.p_idle_min), 3d clocks are scaled up. An additional test is made
+ * for clocking up quickly in response to load peaks.
  *
  * 3d.emc clock is scaled proportionately to 3d clock, with a quadratic-
  * bezier-like factor added to pull 3d.emc rate a bit lower.
 #include <linux/debugfs.h>
 #include <linux/types.h>
 #include <linux/clk.h>
+#include <linux/slab.h>
 #include <mach/clk.h>
 #include <mach/hardware.h>
 #include "scale3d.h"
 #include "dev.h"
 #include <media/tegra_camera.h>
 
+#define GR3D_PRINT_STATS   BIT(1)
+#define GR3D_PRINT_BUSY    BIT(2)
+#define GR3D_PRINT_IDLE    BIT(3)
+#define GR3D_PRINT_HINT    BIT(4)
+#define GR3D_PRINT_TARGET  BIT(5)
+
+/* time frame for load and hint tracking - when events come in at a larger
+ * interval, this probably indicates the current estimates are stale
+ */
+#define GR3D_TIMEFRAME 1000000 /* 1 sec */
+
+/* the number of frames to use in the running average of load estimates and
+ * throughput hints. Choosing 6 frames targets a window of about 100 msec.
+ * Large fluctuations in frame times require a window that's large enough to
+ * prevent spiky scaling behavior, which in turn exacerbates frame rate
+ * instability.
+ */
+#define GR3D_FRAME_SPAN 6
+
 static int scale3d_is_enabled(void);
 static void scale3d_enable(int enable);
 
@@ -60,57 +79,62 @@ static void scale3d_enable(int enable);
 /*
  * debugfs parameters to control 3d clock scaling test
  *
- * period        - time period for clock rate evaluation
- * fast_response - time period for evaluation of 'busy' spikes
- * idle_min      - if less than [idle_min] percent idle over [fast_response]
- *                 microseconds, clock up.
- * idle_max      - if over [idle_max] percent idle over [period] microseconds,
- *                 clock down.
+ * estimation_window  - time period for clock rate evaluation
+ * idle_min           - if less than [idle_min / 10] percent idle over
+ *                      [estimation_window] microseconds, clock up.
+ * idle_max      - if over [idle_max / 10] percent idle over
+ *                 [estimation_window] microseconds, clock down.
  * max_scale     - limits rate changes to no less than (100 - max_scale)% or
  *                 (100 + 2 * max_scale)% of current clock rate
- * verbosity     - set above 5 for debug printouts
+ * verbosity     - bit flags to control debug printouts, by bit number:
+ *                 1 - stats
+ *                 2 - busy
+ *                 3 - idle
+ *                 4 - hints
+ *                 5 - target frequencies
  */
 
 struct scale3d_info_rec {
        struct mutex lock; /* lock for timestamps etc */
        int enable;
        int init;
-       ktime_t idle_frame;
-       ktime_t fast_frame;
-       ktime_t last_idle;
-       ktime_t last_short_term_idle;
+       ktime_t last_scale;
        int is_idle;
-       ktime_t last_tweak;
-       ktime_t last_down;
+       ktime_t last_adjust;
        int fast_up_count;
        int slow_down_count;
        int is_scaled;
-       int fast_responses;
-       unsigned long idle_total;
-       unsigned long idle_short_term_total;
-       unsigned long max_rate_3d;
        long emc_slope;
        long emc_offset;
        long emc_dip_slope;
        long emc_dip_offset;
        long emc_xmid;
+       unsigned long max_rate_3d;
        unsigned long min_rate_3d;
        ktime_t last_throughput_hint;
+
        struct work_struct work;
        struct delayed_work idle_timer;
+
+       ktime_t last_estimation_window;
+       long last_total_idle;
+       long total_idle;
+       ktime_t estimation_window;
+       ktime_t last_notification;
+       long idle_estimate;
+
        unsigned int scale;
+       unsigned int p_busy_cutoff;
+       unsigned int p_estimation_window;
        unsigned int p_use_throughput_hint;
        unsigned int p_throughput_lo_limit;
+       unsigned int p_throughput_lower_limit;
        unsigned int p_throughput_hi_limit;
        unsigned int p_scale_step;
-       unsigned int p_period;
-       unsigned int period;
        unsigned int p_idle_min;
        unsigned int idle_min;
        unsigned int p_idle_max;
        unsigned int idle_max;
-       unsigned int p_fast_response;
-       unsigned int fast_response;
        unsigned int p_adjust;
        unsigned int p_scale_emc;
        unsigned int p_emc_dip;
@@ -118,13 +142,15 @@ struct scale3d_info_rec {
        struct clk *clk_3d;
        struct clk *clk_3d2;
        struct clk *clk_3d_emc;
+       int *freqlist;
+       int freq_count;
 };
 
 static struct scale3d_info_rec scale3d;
 
-static void scale3d_clocks(unsigned long percent)
+static void scale_to_freq(unsigned long hz)
 {
-       unsigned long hz, curr;
+       unsigned long curr;
 
        if (!tegra_is_clk_enabled(scale3d.clk_3d))
                return;
@@ -134,7 +160,8 @@ static void scale3d_clocks(unsigned long percent)
                        return;
 
        curr = clk_get_rate(scale3d.clk_3d);
-       hz = percent * (curr / 100);
+       if (hz == curr)
+               return;
 
        if (!(hz >= scale3d.max_rate_3d && curr == scale3d.max_rate_3d)) {
                if (tegra_get_chipid() == TEGRA_CHIPID_TEGRA3)
@@ -154,6 +181,16 @@ static void scale3d_clocks(unsigned long percent)
        }
 }
 
+static void scale3d_clocks(unsigned long percent)
+{
+       unsigned long hz, curr;
+
+       curr = clk_get_rate(scale3d.clk_3d);
+       hz = percent * (curr / 100);
+
+       scale_to_freq(hz);
+}
+
 static void scale3d_clocks_handler(struct work_struct *work)
 {
        unsigned int scale;
@@ -236,15 +273,6 @@ static void scale3d_enable(int enable)
                reset_3d_clocks();
 }
 
-static void reset_scaling_counters(ktime_t time)
-{
-       scale3d.idle_total = 0;
-       scale3d.idle_short_term_total = 0;
-       scale3d.last_idle = time;
-       scale3d.last_short_term_idle = time;
-       scale3d.idle_frame = time;
-}
-
 /* scaling_adjust - use scale up / scale down hint counts to adjust scaling
  * parameters.
  *
@@ -257,8 +285,6 @@ static void reset_scaling_counters(ktime_t time)
  *
  * the parameters adjusted are
  *
- * * fast_response time
- * * period - time for scaling down estimate
  * * idle_min percentage
  * * idle_max percentage
  */
@@ -271,13 +297,11 @@ static void reset_scaling_counters(ktime_t time)
 static void scaling_adjust(ktime_t time)
 {
        long hint_ratio;
-       long fast_response_adjustment;
-       long period_adjustment;
        int idle_min_adjustment;
        int idle_max_adjustment;
        unsigned long dt;
 
-       dt = (unsigned long) ktime_us_delta(time, scale3d.last_tweak);
+       dt = (unsigned long) ktime_us_delta(time, scale3d.last_adjust);
        if (dt < SCALING_ADJUST_PERIOD)
                return;
 
@@ -285,13 +309,9 @@ static void scaling_adjust(ktime_t time)
                                 (scale3d.slow_down_count + 1);
 
        if (hint_ratio > HINT_RATIO_MAX) {
-               fast_response_adjustment = -((int) scale3d.p_fast_response) / 4;
-               period_adjustment = scale3d.p_period / 2;
                idle_min_adjustment = scale3d.p_idle_min;
                idle_max_adjustment = scale3d.p_idle_max;
        } else if (hint_ratio < HINT_RATIO_MIN) {
-               fast_response_adjustment = scale3d.p_fast_response / 2;
-               period_adjustment = -((int) scale3d.p_period) / 4;
                idle_min_adjustment = -((int) scale3d.p_idle_min) / 2;
                idle_max_adjustment = -((int) scale3d.p_idle_max) / 2;
        } else {
@@ -306,33 +326,23 @@ static void scaling_adjust(ktime_t time)
                        diff *= 2;
                }
 
-               fast_response_adjustment = diff *
-                       (scale3d.p_fast_response / (HINT_RATIO_DIFF * 2));
-               period_adjustment =
-                       diff * (scale3d.p_period / HINT_RATIO_DIFF);
                idle_min_adjustment =
                        (factor * (int) scale3d.p_idle_min) / HINT_RATIO_DIFF;
                idle_max_adjustment =
                        (factor * (int) scale3d.p_idle_max) / HINT_RATIO_DIFF;
        }
 
-       scale3d.fast_response =
-               scale3d.p_fast_response + fast_response_adjustment;
-       scale3d.period = scale3d.p_period + period_adjustment;
-               scale3d.idle_min = scale3d.p_idle_min + idle_min_adjustment;
+       scale3d.idle_min = scale3d.p_idle_min + idle_min_adjustment;
        scale3d.idle_max = scale3d.p_idle_max + idle_max_adjustment;
 
-       if (scale3d.p_verbosity >= 10)
-               pr_info("scale3d stats: + %d - %d (/ %d) f %u p %u min %u max %u\n",
+       if (scale3d.p_verbosity & GR3D_PRINT_STATS)
+               pr_info("scale3d stats: + %d - %d min %u max %u\n",
                        scale3d.fast_up_count, scale3d.slow_down_count,
-                       scale3d.fast_responses, scale3d.fast_response,
-                       scale3d.period, scale3d.idle_min, scale3d.idle_max);
+                       scale3d.idle_min, scale3d.idle_max);
 
        scale3d.fast_up_count = 0;
        scale3d.slow_down_count = 0;
-       scale3d.fast_responses = 0;
-       scale3d.last_down = time;
-       scale3d.last_tweak = time;
+       scale3d.last_adjust = time;
 }
 
 #undef SCALING_ADJUST_PERIOD
@@ -345,61 +355,101 @@ static void scaling_state_check(ktime_t time)
 {
        unsigned long dt;
 
-       /* adjustment: set scale parameters (fast_response, period) +/- 25%
+       /* adjustment: set scale parameters (idle_min, idle_max) +/- 25%
         * based on ratio of scale up to scale down hints
         */
        if (scale3d.p_adjust)
                scaling_adjust(time);
        else {
-               scale3d.fast_response = scale3d.p_fast_response;
-               scale3d.period = scale3d.p_period;
                scale3d.idle_min = scale3d.p_idle_min;
                scale3d.idle_max = scale3d.p_idle_max;
        }
 
-       /* check for load peaks */
-       dt = (unsigned long) ktime_us_delta(time, scale3d.fast_frame);
-       if (dt > scale3d.fast_response) {
-               unsigned long idleness =
-                       (scale3d.idle_short_term_total * 100) / dt;
-               scale3d.fast_responses++;
-               scale3d.fast_frame = time;
-               /* if too busy, scale up */
-               if (idleness < scale3d.idle_min) {
-                       scale3d.is_scaled = 0;
-                       scale3d.fast_up_count++;
-                       if (scale3d.p_verbosity >= 5)
-                               pr_info("scale3d: %ld%% busy\n",
-                                       100 - idleness);
-
-                       reset_3d_clocks();
-                       reset_scaling_counters(time);
-                       return;
-               }
-               scale3d.idle_short_term_total = 0;
-               scale3d.last_short_term_idle = time;
+       dt = (unsigned long) ktime_us_delta(time, scale3d.last_scale);
+       if (dt < scale3d.p_estimation_window)
+               return;
+
+       scale3d.last_scale = time;
+
+       /* if too busy, scale up */
+       if (scale3d.idle_estimate < scale3d.idle_min) {
+               scale3d.is_scaled = 0;
+               scale3d.fast_up_count++;
+               if (scale3d.p_verbosity & GR3D_PRINT_BUSY)
+                       pr_info("scale3d: %ld/1000 busy\n",
+                               1000 - scale3d.idle_estimate);
+
+               reset_3d_clocks();
+               return;
        }
 
-       dt = (unsigned long) ktime_us_delta(time, scale3d.idle_frame);
-       if (dt > scale3d.period) {
-               unsigned long idleness = (scale3d.idle_total * 100) / dt;
+       if (scale3d.p_verbosity & GR3D_PRINT_IDLE)
+               pr_info("scale3d: idle %lu/1000\n",
+                       scale3d.idle_estimate);
 
-               if (scale3d.p_verbosity >= 5)
-                       pr_info("scale3d: idle %lu, ~%lu%%\n",
-                               scale3d.idle_total, idleness);
+       if (scale3d.idle_estimate > scale3d.idle_max) {
+               if (!scale3d.is_scaled)
+                       scale3d.is_scaled = 1;
 
-               if (idleness > scale3d.idle_max) {
-                       if (!scale3d.is_scaled) {
-                               scale3d.is_scaled = 1;
-                               scale3d.last_down = time;
-                       }
-                       scale3d.slow_down_count++;
-                       /* if idle time is high, clock down */
-                       scale3d.scale = 100 - (idleness - scale3d.idle_min);
-                       schedule_work(&scale3d.work);
-               }
+               scale3d.slow_down_count++;
+               /* if idle time is high, clock down */
+               scale3d.scale =
+                       100 - (scale3d.idle_estimate - scale3d.idle_min) / 10;
+               schedule_work(&scale3d.work);
+       }
+}
+
+/* the idle estimate is done by keeping 2 time stamps, initially set to the
+ * same time. Once the estimation_window time has been exceeded, one time
+ * stamp is moved up to the current time. The idle estimate is calculated
+ * based on the idle time percentage from the earlier estimate. The next time
+ * an estimation_window time is exceeded, the previous idle time and estimates
+ * are moved up - this is intended to prevent abrupt changes to the idle
+ * estimate.
+ */
+static void update_load_estimate(int idle)
+{
+       unsigned long window;
+       unsigned long t;
+
+       ktime_t now = ktime_get();
+       t = ktime_us_delta(now, scale3d.last_notification);
+
+       /* if the last event was over GR3D_TIMEFRAME usec ago (1 sec), the
+        * current load tracking data is probably stale
+        */
+       if (t > GR3D_TIMEFRAME) {
+               scale3d.is_idle = idle;
+               scale3d.last_notification = now;
+               scale3d.estimation_window = now;
+               scale3d.last_estimation_window = now;
+               scale3d.total_idle = 0;
+               scale3d.last_total_idle = 0;
+               scale3d.idle_estimate = idle ? 1000 : 0;
+               return;
+       }
 
-               reset_scaling_counters(time);
+       if (scale3d.is_idle) {
+               scale3d.total_idle += t;
+               scale3d.last_total_idle += t;
+       }
+
+       scale3d.is_idle = idle;
+       scale3d.last_notification = now;
+
+       window = ktime_us_delta(now, scale3d.last_estimation_window);
+       /* prevent division by 0 if events come in less than 1 usec apart */
+       if (window > 0)
+               scale3d.idle_estimate =
+                       (1000 * scale3d.last_total_idle) / window;
+
+       /* move up to the last estimation window */
+       if (ktime_us_delta(now, scale3d.estimation_window) >
+               scale3d.p_estimation_window) {
+               scale3d.last_estimation_window = scale3d.estimation_window;
+               scale3d.last_total_idle = scale3d.total_idle;
+               scale3d.total_idle = 0;
+               scale3d.estimation_window = now;
        }
 }
 
@@ -407,136 +457,223 @@ void nvhost_scale3d_notify_idle(struct nvhost_device *dev)
 {
        ktime_t t;
        unsigned long dt;
+       int delay;
 
        if (!scale3d.enable)
                return;
 
+       update_load_estimate(1);
+
+       t = ktime_get();
+
        /* if throughput hint enabled, and last hint is recent enough, return */
        if (scale3d.p_use_throughput_hint) {
-               t = ktime_get();
-               if (ktime_us_delta(t, scale3d.last_throughput_hint) < 1000000)
+               dt = ktime_us_delta(t, scale3d.last_throughput_hint);
+               if (dt < GR3D_TIMEFRAME)
                        return;
        }
 
        mutex_lock(&scale3d.lock);
 
-       t = ktime_get();
-
-       if (scale3d.is_idle) {
-               dt = ktime_us_delta(t, scale3d.last_idle);
-               scale3d.idle_total += dt;
-               dt = ktime_us_delta(t, scale3d.last_short_term_idle);
-               scale3d.idle_short_term_total += dt;
-       } else
-               scale3d.is_idle = 1;
-
-       scale3d.last_idle = t;
-       scale3d.last_short_term_idle = t;
-
-       scaling_state_check(scale3d.last_idle);
+       scaling_state_check(t);
 
-       /* delay idle_max % of 2 * fast_response time (given in microseconds) */
-       schedule_delayed_work(&scale3d.idle_timer,
-               msecs_to_jiffies((scale3d.idle_max * scale3d.fast_response)
-                       / 50000));
+       /* delay idle_max / 10 percent of 2 * estimation_window (in usec) */
+       delay = (scale3d.idle_max * scale3d.p_estimation_window) / 500000;
+       schedule_delayed_work(&scale3d.idle_timer, msecs_to_jiffies(delay));
 
        mutex_unlock(&scale3d.lock);
 }
 
 void nvhost_scale3d_notify_busy(struct nvhost_device *dev)
 {
-       unsigned long idle;
-       unsigned long short_term_idle;
        ktime_t t;
 
        if (!scale3d.enable)
                return;
 
+       update_load_estimate(0);
+
+       t = ktime_get();
+
        /* if throughput hint enabled, and last hint is recent enough, return */
        if (scale3d.p_use_throughput_hint) {
-               t = ktime_get();
-               if (ktime_us_delta(t, scale3d.last_throughput_hint) < 1000000)
+               unsigned long dt;
+               dt = ktime_us_delta(t, scale3d.last_throughput_hint);
+               if (dt < GR3D_TIMEFRAME)
                        return;
        }
 
        mutex_lock(&scale3d.lock);
 
        cancel_delayed_work(&scale3d.idle_timer);
+       scaling_state_check(t);
 
-       t = ktime_get();
+       mutex_unlock(&scale3d.lock);
+}
 
-       if (scale3d.is_idle) {
-               idle = (unsigned long)
-                       ktime_us_delta(t, scale3d.last_idle);
-               scale3d.idle_total += idle;
-               short_term_idle =
-                       ktime_us_delta(t, scale3d.last_short_term_idle);
-               scale3d.idle_short_term_total += short_term_idle;
-               scale3d.is_idle = 0;
-       }
+struct score {
+       int size;               /* number of elements */
+       int pos;                /* position in ring buffer */
+       int count;              /* actual item count */
+       unsigned int sum;       /* running sum */
+       unsigned int prev;      /* previous score after 'reset' operation */
+       unsigned int list[];    /* ring buffer */
+};
 
-       scaling_state_check(t);
+static struct score *score_init(int capacity)
+{
+       struct score *s;
 
-       mutex_unlock(&scale3d.lock);
+       s = kzalloc(sizeof(struct score) + capacity * sizeof(int), GFP_KERNEL);
+       if (s == NULL)
+               return NULL;
+
+       s->size = capacity;
+
+       return s;
 }
 
-static void do_scale(int diff)
+static void score_delete(struct score *s)
 {
-       unsigned long hz, curr;
+       kfree(s);
+}
 
-       if (!tegra_is_clk_enabled(scale3d.clk_3d))
-               return;
+#define score_get_average(s) ((s)->count ? (s)->sum / (s)->count : 0)
 
-       if (tegra_get_chipid() == TEGRA_CHIPID_TEGRA3)
-               if (!tegra_is_clk_enabled(scale3d.clk_3d2))
-                       return;
+static void score_add(struct score *s, unsigned int reading)
+{
+       if (s->count < s->size) {
+               s->sum += reading;
+               s->count++;
+       } else
+               s->sum = s->sum - s->list[s->pos] + reading;
 
-       curr = clk_get_rate(scale3d.clk_3d);
-       hz = curr + diff;
+       s->list[s->pos] = reading;
+       s->pos = (s->pos + 1) % s->size;
+}
 
-       if (hz < scale3d.min_rate_3d)
-               hz = scale3d.min_rate_3d;
 
-       if (hz > scale3d.max_rate_3d)
-               hz = scale3d.max_rate_3d;
+static unsigned int score_reset(struct score *s)
+{
+       s->prev = s->sum;
 
-       if (hz == curr) return;
+       s->count = 0;
+       s->pos = 0;
+       s->sum = 0;
 
-       if (tegra_get_chipid() == TEGRA_CHIPID_TEGRA3)
-               clk_set_rate(scale3d.clk_3d2, 0);
-       clk_set_rate(scale3d.clk_3d, hz);
-
-       if (scale3d.p_scale_emc) {
-               long after = (long) clk_get_rate(scale3d.clk_3d);
-               hz = after * scale3d.emc_slope + scale3d.emc_offset;
-               if (scale3d.p_emc_dip)
-                       hz -=
-                               (scale3d.emc_dip_slope *
-                               POW2(after / 1000 - scale3d.emc_xmid) +
-                               scale3d.emc_dip_offset);
-               clk_set_rate(scale3d.clk_3d_emc, hz);
-       }
+       return s->prev;
 }
 
-#define scale_up() do_scale(scale3d.p_scale_step)
-#define scale_down() do_scale(-scale3d.p_scale_step)
+int freqlist_up(long target, int steps)
+{
+       int i, pos;
+
+       for (i = 0; i < scale3d.freq_count; i++)
+               if (scale3d.freqlist[i] >= target)
+                       break;
+
+       pos = min(scale3d.freq_count - 1, i + steps);
+       return scale3d.freqlist[pos];
+}
+
+int freqlist_down(long target, int steps)
+{
+       int i, pos;
+
+       for (i = scale3d.freq_count - 1; i >= 0; i--)
+               if (scale3d.freqlist[i] <= target)
+                       break;
+
+       pos = max(0, i - steps);
+       return scale3d.freqlist[pos];
+}
 
+static struct score *busy_history;
+static struct score *hint_history;
+
+/* When a throughput hint is given, perform scaling based on the hint and on
+ * the current idle estimation. This is done as follows:
+ *
+ * 1. On moderate loads force min frequency if the throughput hint is not too
+ *    low.
+ * 2. Otherwise, calculate target-rate = max-rate * load-percentage
+ * 3. Unless the current or average throughput hint is below the minimum
+ *    limit, in which case, choose a higher rate
+ * 4. Or the average throughput hint is above the maximum limit, in which case,
+ *    choose a lower rate.
+ */
 void nvhost_scale3d_set_throughput_hint(int hint)
 {
+       ktime_t now;
+       long busy;
+       long curr;
+       long target;
+       long dt;
+       int avg_busy, avg_hint;
+
        if (!scale3d.enable)
                return;
 
        if (!scale3d.p_use_throughput_hint)
                return;
 
-       scale3d.last_throughput_hint = ktime_get();
+       if (scale3d.p_verbosity & GR3D_PRINT_HINT)
+               pr_info("3fds: idle %ld, hint %d\n",
+                       scale3d.idle_estimate, hint);
 
-       if (scale3d.p_use_throughput_hint) {
-               if (hint >= scale3d.p_throughput_hi_limit)
-                       scale_down();
-               else if (hint <= scale3d.p_throughput_lo_limit)
-                       scale_up();
+       now = ktime_get();
+       dt = ktime_us_delta(now, scale3d.last_throughput_hint);
+       if (dt > GR3D_TIMEFRAME) {
+               score_reset(busy_history);
+               score_reset(hint_history);
        }
+
+       scale3d.last_throughput_hint = now;
+
+       busy = 1000 - scale3d.idle_estimate;
+       curr = clk_get_rate(scale3d.clk_3d);
+       target = scale3d.min_rate_3d;
+
+       score_add(busy_history, busy);
+       score_add(hint_history, hint);
+
+       avg_busy = score_get_average(busy_history);
+       avg_hint = score_get_average(hint_history);
+
+       if (busy > 0)
+               target = (curr / 1000) * busy;
+
+       /* In practice, running the gpu at min frequency is typically
+        * sufficient to keep up performance at loads up to 70% on cases,
+        * but the average hint value is tested to keep performance up if
+        * needed.
+        */
+       if (avg_busy <= scale3d.p_busy_cutoff &&
+           avg_hint >= scale3d.p_throughput_lower_limit)
+               target = scale3d.min_rate_3d;
+       else {
+               target = (scale3d.max_rate_3d / 1000) * avg_busy;
+
+               /* Scale up if either the current hint or the running average
+                * are below the target to prevent performance drop.
+                */
+               if (hint <= scale3d.p_throughput_lo_limit ||
+                   avg_hint <= scale3d.p_throughput_lo_limit) {
+                       if (target < curr)
+                               target = curr;
+                       target = freqlist_up(target, scale3d.p_scale_step);
+               } else if (avg_hint >= scale3d.p_throughput_hi_limit) {
+                       if (target > curr)
+                               target = curr;
+                       target = freqlist_down(target, scale3d.p_scale_step);
+               }
+       }
+
+       scale_to_freq(target);
+
+       if (scale3d.p_verbosity & GR3D_PRINT_TARGET)
+               pr_info("3dfs: busy %ld <%d>, curr %ld, t %ld, hint %d <%d>\n",
+                       busy, avg_busy, curr / 1000000, target, hint, avg_hint);
 }
 EXPORT_SYMBOL(nvhost_scale3d_set_throughput_hint);
 
@@ -561,19 +698,6 @@ static void scale3d_idle_handler(struct work_struct *work)
                nvhost_scale3d_notify_idle(NULL);
 }
 
-void nvhost_scale3d_reset()
-{
-       ktime_t t;
-
-       if (!scale3d.enable)
-               return;
-
-       t = ktime_get();
-       mutex_lock(&scale3d.lock);
-       reset_scaling_counters(t);
-       mutex_unlock(&scale3d.lock);
-}
-
 /*
  * debugfs parameters to control 3d clock scaling
  */
@@ -598,16 +722,16 @@ void nvhost_scale3d_debug_init(struct dentry *de)
                } \
        } while (0)
 
-       CREATE_SCALE3D_FILE(fast_response);
+       CREATE_SCALE3D_FILE(estimation_window);
        CREATE_SCALE3D_FILE(idle_min);
        CREATE_SCALE3D_FILE(idle_max);
-       CREATE_SCALE3D_FILE(period);
        CREATE_SCALE3D_FILE(adjust);
        CREATE_SCALE3D_FILE(scale_emc);
        CREATE_SCALE3D_FILE(emc_dip);
        CREATE_SCALE3D_FILE(use_throughput_hint);
        CREATE_SCALE3D_FILE(throughput_hi_limit);
        CREATE_SCALE3D_FILE(throughput_lo_limit);
+       CREATE_SCALE3D_FILE(throughput_lower_limit);
        CREATE_SCALE3D_FILE(scale_step);
        CREATE_SCALE3D_FILE(verbosity);
 #undef CREATE_SCALE3D_FILE
@@ -639,12 +763,17 @@ static ssize_t enable_3d_scaling_store(struct device *dev,
 static DEVICE_ATTR(enable_3d_scaling, S_IRUGO | S_IWUSR,
        enable_3d_scaling_show, enable_3d_scaling_store);
 
+#define MAX_FREQ_COUNT 0x40 /* 64 frequencies should be enough for anyone */
+
 void nvhost_scale3d_init(struct nvhost_device *d)
 {
        if (!scale3d.init) {
                int error;
                unsigned long max_emc, min_emc;
                long correction;
+               long rate;
+               int freqs[MAX_FREQ_COUNT];
+
                mutex_init(&scale3d.lock);
 
                INIT_WORK(&scale3d.work, scale3d_clocks_handler);
@@ -739,34 +868,74 @@ void nvhost_scale3d_init(struct nvhost_device *d)
                                POW2(scale3d.max_rate_3d - scale3d.emc_xmid);
                scale3d.emc_dip_offset -= correction;
 
+               scale3d.is_idle = 1;
+
                /* set scaling parameter defaults */
                scale3d.enable = 1;
-               scale3d.period = scale3d.p_period = 100000;
-               scale3d.idle_min = scale3d.p_idle_min = 10;
-               scale3d.idle_max = scale3d.p_idle_max = 15;
-               scale3d.fast_response = scale3d.p_fast_response = 7000;
+               scale3d.idle_min = scale3d.p_idle_min = 100;
+               scale3d.idle_max = scale3d.p_idle_max = 150;
                scale3d.p_scale_emc = 1;
                scale3d.p_emc_dip = 1;
                scale3d.p_verbosity = 0;
                scale3d.p_adjust = 1;
                scale3d.p_use_throughput_hint = 1;
-               scale3d.p_throughput_lo_limit = 95;
-               scale3d.p_throughput_hi_limit = 100;
-               scale3d.p_scale_step = 60000000;
+               scale3d.p_throughput_lower_limit = 940;
+               scale3d.p_throughput_lo_limit = 990;
+               scale3d.p_throughput_hi_limit = 1010;
+               scale3d.p_scale_step = 1;
+               scale3d.p_estimation_window = 8000;
+               scale3d.p_busy_cutoff = 750;
 
                error = device_create_file(&d->dev,
                                &dev_attr_enable_3d_scaling);
                if (error)
                        dev_err(&d->dev, "failed to create sysfs attributes");
 
+               rate = 0;
+               scale3d.freq_count = 0;
+               while (rate <= scale3d.max_rate_3d) {
+                       long rounded_rate;
+                       if (unlikely(scale3d.freq_count == MAX_FREQ_COUNT)) {
+                               pr_err("%s: too many frequencies\n", __func__);
+                               break;
+                       }
+                       rounded_rate =
+                               clk_round_rate(scale3d.clk_3d, rate);
+                       freqs[scale3d.freq_count++] = rounded_rate;
+                       rate = rounded_rate + 2000;
+               }
+               scale3d.freqlist =
+                       kmalloc(scale3d.freq_count * sizeof(int), GFP_KERNEL);
+               if (scale3d.freqlist == NULL)
+                       pr_err("%s: can\'t allocate freq table\n", __func__);
+
+               memcpy(scale3d.freqlist, freqs,
+                       scale3d.freq_count * sizeof(int));
+
+               busy_history = score_init(GR3D_FRAME_SPAN);
+               if (busy_history == NULL)
+                       pr_err("%s: can\'t init load tracking array\n",
+                              __func__);
+
+               hint_history = score_init(GR3D_FRAME_SPAN);
+               if (hint_history == NULL)
+                       pr_err("%s: can\'t init throughput tracking array\n",
+                              __func__);
+
                scale3d.init = 1;
        }
-
-       nvhost_scale3d_reset();
 }
 
 void nvhost_scale3d_deinit(struct nvhost_device *dev)
 {
        device_remove_file(&dev->dev, &dev_attr_enable_3d_scaling);
        scale3d.init = 0;
+       if (scale3d.freqlist != NULL) {
+               kfree(scale3d.freqlist);
+               scale3d.freq_count = 0;
+               scale3d.freqlist = NULL;
+       }
+
+       score_delete(busy_history);
+       score_delete(hint_history);
 }