From 0432a5c7bfc4a11531c1e58f574c733a2552f149 Mon Sep 17 00:00:00 2001
From: Paolo Valente <paolo.valente@linaro.org>
Date: Tue, 17 May 2016 08:28:04 +0200
Subject: [PATCH 4/4] Turn into BFQ-v8r7 for 4.9.0

CHANGELOG from v8r4 to v8r7

BFQ v8r7

BUGFIX: make BFQ compile also without hierarchical support

BFQ v8r6

BUGFIX Removed the check that, when the new queue to set in service
must be selected, the cached next_in_service entities coincide with
the entities chosen by __bfq_lookup_next_entity. This check,
issuing a warning on failure, was wrong, because the cached and the
newly chosen entity could differ in case of a CLASS_IDLE timeout.

EFFICIENCY IMPROVEMENT (this improvement is related to the above
BUGFIX) The cached next_in_service entities are now really used to
select the next queue to serve when the in-service queue
expires. Before this change, the cached values were used only for
extra (and in general wrong) consistency checks. This caused
additional overhead instead of reducing it.

EFFICIENCY IMPROVEMENT The next entity to serve, for each level of the
hierarchy, is now updated on every event that may change it, i.e., on
every activation or deactivation of any entity. This finer granularity
is not strictly needed for correctness, because it is only on queue
expirations that BFQ needs to know what are the next entities to
serve. Yet this change makes it possible to implement optimizations in
which it is necessary to know the next queue to serve before the
in-service queue expires.

SERVICE-ACCURACY IMPROVEMENT The per-device CLASS_IDLE service timeout
has been turned into a much more accurate per-group timeout.

CODE-QUALITY IMPROVEMENT The non-trivial parts touched by the above
improvements have been partially rewritten, and enriched with comments,
so as to improve their transparency and understandability.

IMPROVEMENT Ported and improved CFQ commit 41647e7a
Before this improvement, BFQ used the same logic for detecting
seeky queues for rotational disks and SSDs. This logic is appropriate
for the former, as it takes into account only inter-request distance,
and the latter is the dominant latency factor on a rotational device.
Yet things change with flash-based devices, where serving a large
request still yields a high throughput, even if the request is far
from the previous request served. This commit extends seeky
detection to take into account also this fact with flash-based
devices. In particular, this commit is an improved port of the
original commit 41647e7a for CFQ.

CODE IMPROVEMENT Remove useless parameter from bfq_del_bfqq_busy

OPTIMIZATION Optimize the update of next_in_service entity.
If the update of the next_in_service candidate entity is triggered by
the activation of an entity, then it is not necessary to perform full
lookups in the active trees to update next_in_service. In fact, it is
enough to check whether the just-activated entity has a higher
priority than next_in_service, or, even if it has the same priority as
next_in_service, is eligible and has a lower virtual finish time than
next_in_service. If this compound condition holds, then the new entity
can be set as the new next_in_service. Otherwise no change is needed.
This commit implements this optimization.

BUGFIX Fix bug causing occasional loss of weight raising.
When a bfq_queue, say bfqq, is split after a merging with another
bfq_queue, BFQ checks whether it has to restore for bfqq the
weight-raising state that bfqq had before being merged.  In
particular, the weight-raising is restored only if, according to the
weight-raising duration decided for bfqq when it started to be
weight-raised (before being merged), bfqq would not have already
finished its weight-raising period.
Yet, by mistake, such a duration was not saved when bfqq is merged. So,
if bfqq was freed and reallocated when it was split, then this duration
was wrongly set to zero on the split. As a consequence, the
weight-raising state of bfqq was wrongly not restored, which caused BFQ
to fail in guaranteeing a low latency to bfqq.
This commit fixes this bug by saving weight-raising duration when bfqq
is merged, and correctly restoring it when bfqq is split.

BUGFIX Fix wrong reset of in-service entities
In-service entities were reset with an indirect logic, which
happened to be even buggy for some cases. This commit fixes
this bug in two important steps. First, by replacing this
indirect logic with a direct logic, in which all involved
entities are immediately reset, with a bubble-up loop, when
the in-service queue is reset. Second, by restructuring the
code related to this change, so as to become not only correct
with respect to this change, but also cleaner and hopefully
clearer.

CODE IMPROVEMENT Add code to be able to redirect trace log to
console.

BUGFIX Fixed bug in optimized update of next_in_service entity.
There was a case where bfq_update_next_in_service did not update
next_in_service, even if it might need to be changed: in case of
requeueing or repositioning of the entity that happened to be
pointed exactly by next_in_service. This could result in violation
of service guarantees, because, after a change of timestamps for
such an entity, it might be the case that next_in_service had to
point to a different entity. This commit fixes this bug.

OPTIMIZATION Stop bubble-up of next_in_service update if possible.

BUGFIX Fixed a false-positive warning for uninitialized var

BFQ-v8r5

DOCUMENTATION IMPROVEMENT Added documentation of BFQ
benefits, inner workings, interface and tunables.

BUGFIX: Replaced max wrongly used for modulo numbers.

DOCUMENTATION IMPROVEMENT Improved help message in
Kconfig.iosched.

BUGFIX: Removed wrong conversion in use of bfq_fifo_expire.

CODE IMPROVEMENT Added parentheses to complex macros.

Signed-off-by: Paolo Valente <paolo.valente@linaro.org>
---
 Documentation/block/00-INDEX        |    2 +
 Documentation/block/bfq-iosched.txt |  530 ++++++
 block/Kconfig.iosched               |   18 +-
 block/bfq-cgroup.c                  |  501 +++---
 block/bfq-iosched.c                 | 3278 ++++++++++++++++++++++-------------
 block/bfq-sched.c                   | 1288 +++++++++++---
 block/bfq.h                         |  800 +++++----
 7 files changed, 4320 insertions(+), 2097 deletions(-)
 create mode 100644 Documentation/block/bfq-iosched.txt

diff --git a/Documentation/block/00-INDEX b/Documentation/block/00-INDEX
index e55103a..8d55b4b 100644
--- a/Documentation/block/00-INDEX
+++ b/Documentation/block/00-INDEX
@@ -1,5 +1,7 @@
 00-INDEX
 	- This file
+bfq-iosched.txt
+	- BFQ IO scheduler and its tunables
 biodoc.txt
 	- Notes on the Generic Block Layer Rewrite in Linux 2.5
 biovecs.txt
diff --git a/Documentation/block/bfq-iosched.txt b/Documentation/block/bfq-iosched.txt
new file mode 100644
index 0000000..13b5248
--- /dev/null
+++ b/Documentation/block/bfq-iosched.txt
@@ -0,0 +1,530 @@
+BFQ (Budget Fair Queueing)
+==========================
+
+BFQ is a proportional-share I/O scheduler, with some extra
+low-latency capabilities. In addition to cgroups support (blkio or io
+controllers), BFQ's main features are:
+- BFQ guarantees a high system and application responsiveness, and a
+  low latency for time-sensitive applications, such as audio or video
+  players;
+- BFQ distributes bandwidth, and not just time, among processes or
+  groups (switching back to time distribution when needed to keep
+  throughput high).
+
+On average CPUs, the current version of BFQ can handle devices
+performing at most ~30 KIOPS; at most ~50 KIOPS on faster CPUs. As a
+reference, 30-50 KIOPS correspond to very high bandwidths with
+sequential I/O (e.g., 8-12 GB/s if I/O requests are 256 KB large), and
+to 120-200 MB/s with 4KB random I/O.
+
+The table of contents follows. Impatient readers can just jump to Section 3.
+
+CONTENTS
+
+1. When may BFQ be useful?
+ 1-1 Personal systems
+ 1-2 Server systems
+2. How does BFQ work?
+3. What are BFQ's tunables?
+4. BFQ group scheduling
+ 4-1 Service guarantees provided
+ 4-2 Interface
+
+1. When may BFQ be useful?
+==========================
+
+BFQ provides the following benefits on personal and server systems.
+
+1-1 Personal systems
+--------------------
+
+Low latency for interactive applications
+
+Regardless of the actual background workload, BFQ guarantees that, for
+interactive tasks, the storage device is virtually as responsive as if
+it was idle. For example, even if one or more of the following
+background workloads are being executed:
+- one or more large files are being read, written or copied,
+- a tree of source files is being compiled,
+- one or more virtual machines are performing I/O,
+- a software update is in progress,
+- indexing daemons are scanning filesystems and updating their
+  databases,
+starting an application or loading a file from within an application
+takes about the same time as if the storage device was idle. As a
+comparison, with CFQ, NOOP or DEADLINE, and in the same conditions,
+applications experience high latencies, or even become unresponsive
+until the background workload terminates (also on SSDs).
+
+Low latency for soft real-time applications
+
+Also soft real-time applications, such as audio and video
+players/streamers, enjoy a low latency and a low drop rate, regardless
+of the background I/O workload. As a consequence, these applications
+do not suffer from almost any glitch due to the background workload.
+
+Higher speed for code-development tasks
+
+If some additional workload happens to be executed in parallel, then
+BFQ executes the I/O-related components of typical code-development
+tasks (compilation, checkout, merge, ...) much more quickly than CFQ,
+NOOP or DEADLINE.
+
+High throughput
+
+On hard disks, BFQ achieves up to 30% higher throughput than CFQ, and
+up to 150% higher throughput than DEADLINE and NOOP, with all the
+sequential workloads considered in our tests. With random workloads,
+and with all the workloads on flash-based devices, BFQ achieves,
+instead, about the same throughput as the other schedulers.
+
+Strong fairness, bandwidth and delay guarantees
+
+BFQ distributes the device throughput, and not just the device time,
+among I/O-bound applications in proportion to their weights, with any
+workload and regardless of the device parameters. From these bandwidth
+guarantees, it is possible to compute tight per-I/O-request delay
+guarantees by a simple formula. If not configured for strict service
+guarantees, BFQ switches to time-based resource sharing (only) for
+applications that would otherwise cause a throughput loss.
+
+1-2 Server systems
+------------------
+
+Most benefits for server systems follow from the same service
+properties as above. In particular, regardless of whether additional,
+possibly heavy workloads are being served, BFQ guarantees:
+
+. audio and video-streaming with zero or very low jitter and drop
+  rate;
+
+. fast retrieval of WEB pages and embedded objects;
+
+. real-time recording of data in live-dumping applications (e.g.,
+  packet logging);
+
+. responsiveness in local and remote access to a server.
+
+
+2. How does BFQ work?
+=====================
+
+BFQ is a proportional-share I/O scheduler, whose general structure,
+plus a lot of code, are borrowed from CFQ.
+
+- Each process doing I/O on a device is associated with a weight and a
+  (bfq_)queue.
+
+- BFQ grants exclusive access to the device, for a while, to one queue
+  (process) at a time, and implements this service model by
+  associating every queue with a budget, measured in number of
+  sectors.
+
+  - After a queue is granted access to the device, the budget of the
+    queue is decremented, on each request dispatch, by the size of the
+    request.
+
+  - The in-service queue is expired, i.e., its service is suspended,
+    only if one of the following events occurs: 1) the queue finishes
+    its budget, 2) the queue empties, 3) a "budget timeout" fires.
+
+    - The budget timeout prevents processes doing random I/O from
+      holding the device for too long and dramatically reducing
+      throughput.
+
+    - Actually, as in CFQ, a queue associated with a process issuing
+      sync requests may not be expired immediately when it empties. In
+      contrast, BFQ may idle the device for a short time interval,
+      giving the process the chance to go on being served if it issues
+      a new request in time. Device idling typically boosts the
+      throughput on rotational devices, if processes do synchronous
+      and sequential I/O. In addition, under BFQ, device idling is
+      also instrumental in guaranteeing the desired throughput
+      fraction to processes issuing sync requests (see the description
+      of the slice_idle tunable in this document, or [1, 2], for more
+      details).
+
+      - With respect to idling for service guarantees, if several
+	processes are competing for the device at the same time, but
+	all processes (and groups, after the following commit) have
+	the same weight, then BFQ guarantees the expected throughput
+	distribution without ever idling the device. Throughput is
+	thus as high as possible in this common scenario.
+
+  - If low-latency mode is enabled (default configuration), BFQ
+    executes some special heuristics to detect interactive and soft
+    real-time applications (e.g., video or audio players/streamers),
+    and to reduce their latency. The most important action taken to
+    achieve this goal is to give to the queues associated with these
+    applications more than their fair share of the device
+    throughput. For brevity, we call just "weight-raising" the whole
+    sets of actions taken by BFQ to privilege these queues. In
+    particular, BFQ provides a milder form of weight-raising for
+    interactive applications, and a stronger form for soft real-time
+    applications.
+
+  - BFQ automatically deactivates idling for queues born in a burst of
+    queue creations. In fact, these queues are usually associated with
+    the processes of applications and services that benefit mostly
+    from a high throughput. Examples are systemd during boot, or git
+    grep.
+
+  - As CFQ, BFQ merges queues performing interleaved I/O, i.e.,
+    performing random I/O that becomes mostly sequential if
+    merged. Differently from CFQ, BFQ achieves this goal with a more
+    reactive mechanism, called Early Queue Merge (EQM). EQM is so
+    responsive in detecting interleaved I/O (cooperating processes),
+    that it enables BFQ to achieve a high throughput, by queue
+    merging, even for queues for which CFQ needs a different
+    mechanism, preemption, to get a high throughput. As such EQM is a
+    unified mechanism to achieve a high throughput with interleaved
+    I/O.
+
+  - Queues are scheduled according to a variant of WF2Q+, named
+    B-WF2Q+, and implemented using an augmented rb-tree to preserve an
+    O(log N) overall complexity.  See [2] for more details. B-WF2Q+ is
+    also ready for hierarchical scheduling. However, for a cleaner
+    logical breakdown, the code that enables and completes
+    hierarchical support is provided in the next commit, which focuses
+    exactly on this feature.
+
+  - B-WF2Q+ guarantees a tight deviation with respect to an ideal,
+    perfectly fair, and smooth service. In particular, B-WF2Q+
+    guarantees that each queue receives a fraction of the device
+    throughput proportional to its weight, even if the throughput
+    fluctuates, and regardless of: the device parameters, the current
+    workload and the budgets assigned to the queue.
+
+  - The last, budget-independence, property (although probably
+    counterintuitive in the first place) is definitely beneficial, for
+    the following reasons:
+
+    - First, with any proportional-share scheduler, the maximum
+      deviation with respect to an ideal service is proportional to
+      the maximum budget (slice) assigned to queues. As a consequence,
+      BFQ can keep this deviation tight not only because of the
+      accurate service of B-WF2Q+, but also because BFQ *does not*
+      need to assign a larger budget to a queue to let the queue
+      receive a higher fraction of the device throughput.
+
+    - Second, BFQ is free to choose, for every process (queue), the
+      budget that best fits the needs of the process, or best
+      leverages the I/O pattern of the process. In particular, BFQ
+      updates queue budgets with a simple feedback-loop algorithm that
+      allows a high throughput to be achieved, while still providing
+      tight latency guarantees to time-sensitive applications. When
+      the in-service queue expires, this algorithm computes the next
+      budget of the queue so as to:
+
+      - Let large budgets be eventually assigned to the queues
+	associated with I/O-bound applications performing sequential
+	I/O: in fact, the longer these applications are served once
+	they get access to the device, the higher the throughput is.
+
+      - Let small budgets be eventually assigned to the queues
+	associated with time-sensitive applications (which typically
+	perform sporadic and short I/O), because, the smaller the
+	budget assigned to a queue waiting for service is, the sooner
+	B-WF2Q+ will serve that queue (Subsec 3.3 in [2]).
+
+- If several processes are competing for the device at the same time,
+  but all processes and groups have the same weight, then BFQ
+  guarantees the expected throughput distribution without ever idling
+  the device. It uses preemption instead. Throughput is then much
+  higher in this common scenario.
+
+- ioprio classes are served in strict priority order, i.e.,
+  lower-priority queues are not served as long as there are
+  higher-priority queues.  Among queues in the same class, the
+  bandwidth is distributed in proportion to the weight of each
+  queue. A very thin extra bandwidth is however guaranteed to
+  the Idle class, to prevent it from starving.
+
+
+3. What are BFQ's tunables?
+===========================
+
+The tunables back_seek_max, back_seek_penalty, fifo_expire_async and
+fifo_expire_sync below are the same as in CFQ. Their description is
+just copied from that for CFQ. Some considerations in the description
+of slice_idle are copied from CFQ too.
+
+per-process ioprio and weight
+-----------------------------
+
+Unless the cgroups interface is used (see "4. BFQ group scheduling"),
+weights can be assigned to processes only indirectly, through I/O
+priorities, and according to the relation:
+weight = (IOPRIO_BE_NR - ioprio) * 10.
+
+Beware that, if low_latency is set, then BFQ automatically raises the
+weight of the queues associated with interactive and soft real-time
+applications. Unset this tunable if you need/want to control weights.
+
+slice_idle
+----------
+
+This parameter specifies how long BFQ should idle for next I/O
+request, when certain sync BFQ queues become empty. By default
+slice_idle is a non-zero value. Idling has a double purpose: boosting
+throughput and making sure that the desired throughput distribution is
+respected (see the description of how BFQ works, and, if needed, the
+papers referred there).
+
+As for throughput, idling can be very helpful on highly seeky media
+like single spindle SATA/SAS disks where we can cut down on overall
+number of seeks and see improved throughput.
+
+Setting slice_idle to 0 will remove all the idling on queues and one
+should see an overall improved throughput on faster storage devices
+like multiple SATA/SAS disks in hardware RAID configuration.
+
+So depending on storage and workload, it might be useful to set
+slice_idle=0.  In general for SATA/SAS disks and software RAID of
+SATA/SAS disks keeping slice_idle enabled should be useful. For any
+configurations where there are multiple spindles behind single LUN
+(Host based hardware RAID controller or for storage arrays), setting
+slice_idle=0 might end up in better throughput and acceptable
+latencies.
+
+Idling is however necessary to have service guarantees enforced in
+case of differentiated weights or differentiated I/O-request lengths.
+To see why, suppose that a given BFQ queue A must get several I/O
+requests served for each request served for another queue B. Idling
+ensures that, if A makes a new I/O request slightly after becoming
+empty, then no request of B is dispatched in the middle, and thus A
+does not lose the possibility to get more than one request dispatched
+before the next request of B is dispatched. Note that idling
+guarantees the desired differentiated treatment of queues only in
+terms of I/O-request dispatches. To guarantee that the actual service
+order then corresponds to the dispatch order, the strict_guarantees
+tunable must be set too.
+
+There is an important flipside for idling: apart from the above cases
+where it is beneficial also for throughput, idling can severely impact
+throughput. One important case is random workload. Because of this
+issue, BFQ tends to avoid idling as much as possible, when it is not
+beneficial also for throughput. As a consequence of this behavior, and
+of further issues described for the strict_guarantees tunable,
+short-term service guarantees may be occasionally violated. And, in
+some cases, these guarantees may be more important than guaranteeing
+maximum throughput. For example, in video playing/streaming, a very
+low drop rate may be more important than maximum throughput. In these
+cases, consider setting the strict_guarantees parameter.
+
+strict_guarantees
+-----------------
+
+If this parameter is set (default: unset), then BFQ
+
+- always performs idling when the in-service queue becomes empty;
+
+- forces the device to serve one I/O request at a time, by dispatching a
+  new request only if there is no outstanding request.
+
+In the presence of differentiated weights or I/O-request sizes, both
+the above conditions are needed to guarantee that every BFQ queue
+receives its allotted share of the bandwidth. The first condition is
+needed for the reasons explained in the description of the slice_idle
+tunable.  The second condition is needed because all modern storage
+devices reorder internally-queued requests, which may trivially break
+the service guarantees enforced by the I/O scheduler.
+
+Setting strict_guarantees may evidently affect throughput.
+
+back_seek_max
+-------------
+
+This specifies, given in Kbytes, the maximum "distance" for backward seeking.
+The distance is the amount of space from the current head location to the
+sectors that are backward in terms of distance.
+
+This parameter allows the scheduler to anticipate requests in the "backward"
+direction and consider them as being the "next" if they are within this
+distance from the current head location.
+
+back_seek_penalty
+-----------------
+
+This parameter is used to compute the cost of backward seeking. If the
+backward distance of request is just 1/back_seek_penalty from a "front"
+request, then the seeking cost of two requests is considered equivalent.
+
+So scheduler will not bias toward one or the other request (otherwise scheduler
+will bias toward front request). Default value of back_seek_penalty is 2.
+
+fifo_expire_async
+-----------------
+
+This parameter is used to set the timeout of asynchronous requests. Default
+value of this is 248ms.
+
+fifo_expire_sync
+----------------
+
+This parameter is used to set the timeout of synchronous requests. Default
+value of this is 124ms. To favor synchronous requests over asynchronous
+ones, this value should be decreased relative to fifo_expire_async.
+
+low_latency
+-----------
+
+This parameter is used to enable/disable BFQ's low latency mode. By
+default, low latency mode is enabled. If enabled, interactive and soft
+real-time applications are privileged and experience a lower latency,
+as explained in more detail in the description of how BFQ works.
+
+DO NOT enable this mode if you need full control on bandwidth
+distribution. In fact, if it is enabled, then BFQ automatically
+increases the bandwidth share of privileged applications, as the main
+means to guarantee a lower latency to them.
+
+timeout_sync
+------------
+
+Maximum amount of device time that can be given to a task (queue) once
+it has been selected for service. On devices with costly seeks,
+increasing this time usually increases maximum throughput. On the
+opposite end, increasing this time coarsens the granularity of the
+short-term bandwidth and latency guarantees, especially if the
+following parameter is set to zero.
+
+max_budget
+----------
+
+Maximum amount of service, measured in sectors, that can be provided
+to a BFQ queue once it is set in service (of course within the limits
+of the above timeout). According to what is said in the description of
+the algorithm, larger values increase the throughput in proportion to
+the percentage of sequential I/O requests issued. The price of larger
+values is that they coarsen the granularity of short-term bandwidth
+and latency guarantees.
+
+The default value is 0, which enables auto-tuning: BFQ sets max_budget
+to the maximum number of sectors that can be served during
+timeout_sync, according to the estimated peak rate.
+
+weights
+-------
+
+Read-only parameter, used to show the weights of the currently active
+BFQ queues.
+
+
+wr_ tunables
+------------
+
+BFQ exports a few parameters to control/tune the behavior of
+low-latency heuristics.
+
+wr_coeff
+
+Factor by which the weight of a weight-raised queue is multiplied. If
+the queue is deemed soft real-time, then the weight is further
+multiplied by an additional, constant factor.
+
+wr_max_time
+
+Maximum duration of a weight-raising period for an interactive task
+(ms). If set to zero (default value), then this value is computed
+automatically, as a function of the peak rate of the device. In any
+case, when the value of this parameter is read, it always reports the
+current duration, regardless of whether it has been set manually or
+computed automatically.
+
+wr_max_softrt_rate
+
+Maximum service rate below which a queue is deemed to be associated
+with a soft real-time application, and is then weight-raised
+accordingly (sectors/sec).
+
+wr_min_idle_time
+
+Minimum idle period after which interactive weight-raising may be
+reactivated for a queue (in ms).
+
+wr_rt_max_time
+
+Maximum weight-raising duration for soft real-time queues (in ms). The
+start time from which this duration is considered is automatically
+moved forward if the queue is detected to be still soft real-time
+before the current soft real-time weight-raising period finishes.
+
+wr_min_inter_arr_async
+
+Minimum period between I/O request arrivals after which weight-raising
+may be reactivated for an already busy async queue (in ms).
+
+
+4. BFQ group scheduling
+=======================
+
+BFQ supports both cgroups-v1 and cgroups-v2 io controllers, namely
+blkio and io. In particular, BFQ supports weight-based proportional
+share. To activate cgroups support, set BFQ_GROUP_IOSCHED.
+
+4-1 Service guarantees provided
+-------------------------------
+
+With BFQ, proportional share means true proportional share of the
+device bandwidth, according to group weights. For example, a group
+with weight 200 gets twice the bandwidth, and not just twice the time,
+of a group with weight 100.
+
+BFQ supports hierarchies (group trees) of any depth. Bandwidth is
+distributed among groups and processes in the expected way: for each
+group, the children of the group share the whole bandwidth of the
+group in proportion to their weights. In particular, this implies
+that, for each leaf group, every process of the group receives the
+same share of the whole group bandwidth, unless the ioprio of the
+process is modified.
+
+The resource-sharing guarantee for a group may partially or totally
+switch from bandwidth to time, if providing bandwidth guarantees to
+the group lowers the throughput too much. This switch occurs on a
+per-process basis: if a process of a leaf group causes throughput loss
+if served in such a way to receive its share of the bandwidth, then
+BFQ switches back to just time-based proportional share for that
+process.
+
+4-2 Interface
+-------------
+
+To get proportional sharing of bandwidth with BFQ for a given device,
+BFQ must of course be the active scheduler for that device.
+
+Within each group directory, the names of the files associated with
+BFQ-specific cgroup parameters and stats begin with the "bfq."
+prefix. So, with cgroups-v1 or cgroups-v2, the full prefix for
+BFQ-specific files is "blkio.bfq." or "io.bfq." For example, the group
+parameter to set the weight of a group with BFQ is blkio.bfq.weight
+or io.bfq.weight.
+
+Parameters to set
+-----------------
+
+For each group, there is only the following parameter to set.
+
+weight (namely blkio.bfq.weight or io.bfq.weight): the weight of the
+group inside its parent. Available values: 1..10000 (default 100). The
+linear mapping between ioprio and weights, described at the beginning
+of the tunable section, is still valid, but all weights higher than
+IOPRIO_BE_NR*10 are mapped to ioprio 0.
+
+Recall that, if low_latency is set, then BFQ automatically raises the
+weight of the queues associated with interactive and soft real-time
+applications. Unset this tunable if you need/want to control weights.
+
+
+[1] P. Valente, A. Avanzini, "Evolution of the BFQ Storage I/O
+    Scheduler", Proceedings of the First Workshop on Mobile System
+    Technologies (MST-2015), May 2015.
+    http://algogroup.unimore.it/people/paolo/disk_sched/mst-2015.pdf
+
+[2] P. Valente and M. Andreolini, "Improving Application
+    Responsiveness with the BFQ Disk I/O Scheduler", Proceedings of
+    the 5th Annual International Systems and Storage Conference
+    (SYSTOR '12), June 2012.
+    Slightly extended version:
+    http://algogroup.unimore.it/people/paolo/disk_sched/bfq-v1-suite-
+							results.pdf
diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
index f78cd1a..f2cd945 100644
--- a/block/Kconfig.iosched
+++ b/block/Kconfig.iosched
@@ -43,20 +43,20 @@ config IOSCHED_BFQ
 	tristate "BFQ I/O scheduler"
 	default n
 	---help---
-	  The BFQ I/O scheduler tries to distribute bandwidth among
-	  all processes according to their weights.
-	  It aims at distributing the bandwidth as desired, independently of
-	  the disk parameters and with any workload. It also tries to
-	  guarantee low latency to interactive and soft real-time
-	  applications. If compiled built-in (saying Y here), BFQ can
-	  be configured to support hierarchical scheduling.
+	The BFQ I/O scheduler distributes bandwidth among all
+	processes according to their weights, regardless of the
+	device parameters and with any workload. It also guarantees
+	a low latency to interactive and soft real-time applications.
+	Details in Documentation/block/bfq-iosched.txt
 
 config BFQ_GROUP_IOSCHED
 	bool "BFQ hierarchical scheduling support"
-	depends on CGROUPS && IOSCHED_BFQ=y
+	depends on IOSCHED_BFQ && BLK_CGROUP
 	default n
 	---help---
-	  Enable hierarchical scheduling in BFQ, using the blkio controller.
+
+	Enable hierarchical scheduling in BFQ, using the blkio
+	(cgroups-v1) or io (cgroups-v2) controller.
 
 choice
 	prompt "Default I/O scheduler"
diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
index 0367996..bbaecd0 100644
--- a/block/bfq-cgroup.c
+++ b/block/bfq-cgroup.c
@@ -7,7 +7,9 @@
  * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
  *		      Paolo Valente <paolo.valente@unimore.it>
  *
- * Copyright (C) 2010 Paolo Valente <paolo.valente@unimore.it>
+ * Copyright (C) 2015 Paolo Valente <paolo.valente@unimore.it>
+ *
+ * Copyright (C) 2016 Paolo Valente <paolo.valente@linaro.org>
  *
  * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ
  * file.
@@ -163,8 +165,6 @@ static struct bfq_group *blkg_to_bfqg(struct blkcg_gq *blkg)
 {
 	struct blkg_policy_data *pd = blkg_to_pd(blkg, &blkcg_policy_bfq);
 
-	BUG_ON(!pd);
-
 	return pd_to_bfqg(pd);
 }
 
@@ -208,59 +208,49 @@ static void bfqg_put(struct bfq_group *bfqg)
 
 static void bfqg_stats_update_io_add(struct bfq_group *bfqg,
 				     struct bfq_queue *bfqq,
-				     int rw)
+				     int op, int op_flags)
 {
-	blkg_rwstat_add(&bfqg->stats.queued, rw, 1);
+	blkg_rwstat_add(&bfqg->stats.queued, op, op_flags, 1);
 	bfqg_stats_end_empty_time(&bfqg->stats);
 	if (!(bfqq == ((struct bfq_data *)bfqg->bfqd)->in_service_queue))
 		bfqg_stats_set_start_group_wait_time(bfqg, bfqq_group(bfqq));
 }
 
-static void bfqg_stats_update_io_remove(struct bfq_group *bfqg, int rw)
-{
-	blkg_rwstat_add(&bfqg->stats.queued, rw, -1);
-}
-
-static void bfqg_stats_update_io_merged(struct bfq_group *bfqg, int rw)
+static void bfqg_stats_update_io_remove(struct bfq_group *bfqg, int op,
+					int op_flags)
 {
-	blkg_rwstat_add(&bfqg->stats.merged, rw, 1);
+	blkg_rwstat_add(&bfqg->stats.queued, op, op_flags, -1);
 }
 
-static void bfqg_stats_update_dispatch(struct bfq_group *bfqg,
-					      uint64_t bytes, int rw)
+static void bfqg_stats_update_io_merged(struct bfq_group *bfqg,  int op,
+					int op_flags)
 {
-	blkg_stat_add(&bfqg->stats.sectors, bytes >> 9);
-	blkg_rwstat_add(&bfqg->stats.serviced, rw, 1);
-	blkg_rwstat_add(&bfqg->stats.service_bytes, rw, bytes);
+	blkg_rwstat_add(&bfqg->stats.merged, op, op_flags, 1);
 }
 
 static void bfqg_stats_update_completion(struct bfq_group *bfqg,
-			uint64_t start_time, uint64_t io_start_time, int rw)
+			uint64_t start_time, uint64_t io_start_time, int op,
+			int op_flags)
 {
 	struct bfqg_stats *stats = &bfqg->stats;
 	unsigned long long now = sched_clock();
 
 	if (time_after64(now, io_start_time))
-		blkg_rwstat_add(&stats->service_time, rw, now - io_start_time);
+		blkg_rwstat_add(&stats->service_time, op, op_flags,
+				now - io_start_time);
 	if (time_after64(io_start_time, start_time))
-		blkg_rwstat_add(&stats->wait_time, rw,
+		blkg_rwstat_add(&stats->wait_time, op, op_flags,
 				io_start_time - start_time);
 }
 
 /* @stats = 0 */
 static void bfqg_stats_reset(struct bfqg_stats *stats)
 {
-	if (!stats)
-		return;
-
 	/* queued stats shouldn't be cleared */
-	blkg_rwstat_reset(&stats->service_bytes);
-	blkg_rwstat_reset(&stats->serviced);
 	blkg_rwstat_reset(&stats->merged);
 	blkg_rwstat_reset(&stats->service_time);
 	blkg_rwstat_reset(&stats->wait_time);
 	blkg_stat_reset(&stats->time);
-	blkg_stat_reset(&stats->unaccounted_time);
 	blkg_stat_reset(&stats->avg_queue_size_sum);
 	blkg_stat_reset(&stats->avg_queue_size_samples);
 	blkg_stat_reset(&stats->dequeue);
@@ -270,19 +260,16 @@ static void bfqg_stats_reset(struct bfqg_stats *stats)
 }
 
 /* @to += @from */
-static void bfqg_stats_merge(struct bfqg_stats *to, struct bfqg_stats *from)
+static void bfqg_stats_add_aux(struct bfqg_stats *to, struct bfqg_stats *from)
 {
 	if (!to || !from)
 		return;
 
 	/* queued stats shouldn't be cleared */
-	blkg_rwstat_add_aux(&to->service_bytes, &from->service_bytes);
-	blkg_rwstat_add_aux(&to->serviced, &from->serviced);
 	blkg_rwstat_add_aux(&to->merged, &from->merged);
 	blkg_rwstat_add_aux(&to->service_time, &from->service_time);
 	blkg_rwstat_add_aux(&to->wait_time, &from->wait_time);
-	blkg_stat_add_aux(&from->time, &from->time);
-	blkg_stat_add_aux(&to->unaccounted_time, &from->unaccounted_time);
+	blkg_stat_add_aux(&to->time, &from->time);
 	blkg_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum);
 	blkg_stat_add_aux(&to->avg_queue_size_samples,
 			  &from->avg_queue_size_samples);
@@ -311,10 +298,8 @@ static void bfqg_stats_xfer_dead(struct bfq_group *bfqg)
 	if (unlikely(!parent))
 		return;
 
-	bfqg_stats_merge(&parent->dead_stats, &bfqg->stats);
-	bfqg_stats_merge(&parent->dead_stats, &bfqg->dead_stats);
+	bfqg_stats_add_aux(&parent->stats, &bfqg->stats);
 	bfqg_stats_reset(&bfqg->stats);
-	bfqg_stats_reset(&bfqg->dead_stats);
 }
 
 static void bfq_init_entity(struct bfq_entity *entity,
@@ -329,21 +314,17 @@ static void bfq_init_entity(struct bfq_entity *entity,
 		bfqq->ioprio_class = bfqq->new_ioprio_class;
 		bfqg_get(bfqg);
 	}
-	entity->parent = bfqg->my_entity;
+	entity->parent = bfqg->my_entity; /* NULL for root group */
 	entity->sched_data = &bfqg->sched_data;
 }
 
 static void bfqg_stats_exit(struct bfqg_stats *stats)
 {
-	blkg_rwstat_exit(&stats->service_bytes);
-	blkg_rwstat_exit(&stats->serviced);
 	blkg_rwstat_exit(&stats->merged);
 	blkg_rwstat_exit(&stats->service_time);
 	blkg_rwstat_exit(&stats->wait_time);
 	blkg_rwstat_exit(&stats->queued);
-	blkg_stat_exit(&stats->sectors);
 	blkg_stat_exit(&stats->time);
-	blkg_stat_exit(&stats->unaccounted_time);
 	blkg_stat_exit(&stats->avg_queue_size_sum);
 	blkg_stat_exit(&stats->avg_queue_size_samples);
 	blkg_stat_exit(&stats->dequeue);
@@ -354,15 +335,11 @@ static void bfqg_stats_exit(struct bfqg_stats *stats)
 
 static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp)
 {
-	if (blkg_rwstat_init(&stats->service_bytes, gfp) ||
-	    blkg_rwstat_init(&stats->serviced, gfp) ||
-	    blkg_rwstat_init(&stats->merged, gfp) ||
+	if (blkg_rwstat_init(&stats->merged, gfp) ||
 	    blkg_rwstat_init(&stats->service_time, gfp) ||
 	    blkg_rwstat_init(&stats->wait_time, gfp) ||
 	    blkg_rwstat_init(&stats->queued, gfp) ||
-	    blkg_stat_init(&stats->sectors, gfp) ||
 	    blkg_stat_init(&stats->time, gfp) ||
-	    blkg_stat_init(&stats->unaccounted_time, gfp) ||
 	    blkg_stat_init(&stats->avg_queue_size_sum, gfp) ||
 	    blkg_stat_init(&stats->avg_queue_size_samples, gfp) ||
 	    blkg_stat_init(&stats->dequeue, gfp) ||
@@ -386,11 +363,27 @@ static struct bfq_group_data *blkcg_to_bfqgd(struct blkcg *blkcg)
 	return cpd_to_bfqgd(blkcg_to_cpd(blkcg, &blkcg_policy_bfq));
 }
 
+static struct blkcg_policy_data *bfq_cpd_alloc(gfp_t gfp)
+{
+	struct bfq_group_data *bgd;
+
+	bgd = kzalloc(sizeof(*bgd), gfp); /* honor the caller's gfp mask */
+	if (!bgd)
+		return NULL;
+	return &bgd->pd;
+}
+
 static void bfq_cpd_init(struct blkcg_policy_data *cpd)
 {
 	struct bfq_group_data *d = cpd_to_bfqgd(cpd);
 
-	d->weight = BFQ_DEFAULT_GRP_WEIGHT;
+	d->weight = cgroup_subsys_on_dfl(io_cgrp_subsys) ?
+		CGROUP_WEIGHT_DFL : BFQ_WEIGHT_LEGACY_DFL;
+}
+
+static void bfq_cpd_free(struct blkcg_policy_data *cpd)
+{
+	kfree(cpd_to_bfqgd(cpd));
 }
 
 static struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, int node)
@@ -401,8 +394,7 @@ static struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, int node)
 	if (!bfqg)
 		return NULL;
 
-	if (bfqg_stats_init(&bfqg->stats, gfp) ||
-	    bfqg_stats_init(&bfqg->dead_stats, gfp)) {
+	if (bfqg_stats_init(&bfqg->stats, gfp)) {
 		kfree(bfqg);
 		return NULL;
 	}
@@ -410,27 +402,20 @@ static struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, int node)
 	return &bfqg->pd;
 }
 
-static void bfq_group_set_parent(struct bfq_group *bfqg,
-					struct bfq_group *parent)
+static void bfq_pd_init(struct blkg_policy_data *pd)
 {
+	struct blkcg_gq *blkg;
+	struct bfq_group *bfqg;
+	struct bfq_data *bfqd;
 	struct bfq_entity *entity;
+	struct bfq_group_data *d;
 
-	BUG_ON(!parent);
-	BUG_ON(!bfqg);
-	BUG_ON(bfqg == parent);
-
+	blkg = pd_to_blkg(pd);
+	BUG_ON(!blkg);
+	bfqg = blkg_to_bfqg(blkg);
+	bfqd = blkg->q->elevator->elevator_data;
 	entity = &bfqg->entity;
-	entity->parent = parent->my_entity;
-	entity->sched_data = &parent->sched_data;
-}
-
-static void bfq_pd_init(struct blkg_policy_data *pd)
-{
-	struct blkcg_gq *blkg = pd_to_blkg(pd);
-	struct bfq_group *bfqg = blkg_to_bfqg(blkg);
-	struct bfq_data *bfqd = blkg->q->elevator->elevator_data;
-	struct bfq_entity *entity = &bfqg->entity;
-	struct bfq_group_data *d = blkcg_to_bfqgd(blkg->blkcg);
+	d = blkcg_to_bfqgd(blkg->blkcg);
 
 	entity->orig_weight = entity->weight = entity->new_weight = d->weight;
 	entity->my_sched_data = &bfqg->sched_data;
@@ -448,70 +433,53 @@ static void bfq_pd_free(struct blkg_policy_data *pd)
 	struct bfq_group *bfqg = pd_to_bfqg(pd);
 
 	bfqg_stats_exit(&bfqg->stats);
-	bfqg_stats_exit(&bfqg->dead_stats);
-
 	return kfree(bfqg);
 }
 
-/* offset delta from bfqg->stats to bfqg->dead_stats */
-static const int dead_stats_off_delta = offsetof(struct bfq_group, dead_stats) -
-					offsetof(struct bfq_group, stats);
-
-/* to be used by recursive prfill, sums live and dead stats recursively */
-static u64 bfqg_stat_pd_recursive_sum(struct blkg_policy_data *pd, int off)
+static void bfq_pd_reset_stats(struct blkg_policy_data *pd)
 {
-	u64 sum = 0;
+	struct bfq_group *bfqg = pd_to_bfqg(pd);
 
-	sum += blkg_stat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_bfq, off);
-	sum += blkg_stat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_bfq,
-				       off + dead_stats_off_delta);
-	return sum;
+	bfqg_stats_reset(&bfqg->stats);
 }
 
-/* to be used by recursive prfill, sums live and dead rwstats recursively */
-static struct blkg_rwstat
-bfqg_rwstat_pd_recursive_sum(struct blkg_policy_data *pd, int off)
+static void bfq_group_set_parent(struct bfq_group *bfqg,
+					struct bfq_group *parent)
 {
-	struct blkg_rwstat a, b;
+	struct bfq_entity *entity;
+
+	BUG_ON(!parent);
+	BUG_ON(!bfqg);
+	BUG_ON(bfqg == parent);
 
-	a = blkg_rwstat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_bfq, off);
-	b = blkg_rwstat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_bfq,
-				      off + dead_stats_off_delta);
-	blkg_rwstat_add_aux(&a, &b);
-	return a;
+	entity = &bfqg->entity;
+	entity->parent = parent->my_entity;
+	entity->sched_data = &parent->sched_data;
 }
 
-static void bfq_pd_reset_stats(struct blkg_policy_data *pd)
+static struct bfq_group *bfq_lookup_bfqg(struct bfq_data *bfqd,
+					 struct blkcg *blkcg)
 {
-	struct bfq_group *bfqg = pd_to_bfqg(pd);
+	struct blkcg_gq *blkg;
 
-	bfqg_stats_reset(&bfqg->stats);
-	bfqg_stats_reset(&bfqg->dead_stats);
+	blkg = blkg_lookup(blkcg, bfqd->queue);
+	if (likely(blkg))
+		return blkg_to_bfqg(blkg);
+	return NULL;
 }
 
-static struct bfq_group *bfq_find_alloc_group(struct bfq_data *bfqd,
-					      struct blkcg *blkcg)
+static struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd,
+					    struct blkcg *blkcg)
 {
-	struct request_queue *q = bfqd->queue;
-	struct bfq_group *bfqg = NULL, *parent;
-	struct bfq_entity *entity = NULL;
+	struct bfq_group *bfqg, *parent;
+	struct bfq_entity *entity;
 
 	assert_spin_locked(bfqd->queue->queue_lock);
 
-	/* avoid lookup for the common case where there's no blkcg */
-	if (blkcg == &blkcg_root) {
-		bfqg = bfqd->root_group;
-	} else {
-		struct blkcg_gq *blkg;
-
-		blkg = blkg_lookup_create(blkcg, q);
-		if (!IS_ERR(blkg))
-			bfqg = blkg_to_bfqg(blkg);
-		else /* fallback to root_group */
-			bfqg = bfqd->root_group;
-	}
+	bfqg = bfq_lookup_bfqg(bfqd, blkcg);
 
-	BUG_ON(!bfqg);
+	if (unlikely(!bfqg))
+		return NULL;
 
 	/*
 	 * Update chain of bfq_groups as we might be handling a leaf group
@@ -537,11 +505,15 @@ static struct bfq_group *bfq_find_alloc_group(struct bfq_data *bfqd,
 static void bfq_pos_tree_add_move(struct bfq_data *bfqd,
 				  struct bfq_queue *bfqq);
 
+static void bfq_bfqq_expire(struct bfq_data *bfqd,
+			    struct bfq_queue *bfqq,
+			    bool compensate,
+			    enum bfqq_expiration reason);
+
 /**
  * bfq_bfqq_move - migrate @bfqq to @bfqg.
  * @bfqd: queue descriptor.
  * @bfqq: the queue to move.
- * @entity: @bfqq's entity.
  * @bfqg: the group to move to.
  *
  * Move @bfqq to @bfqg, deactivating it from its old group and reactivating
@@ -552,26 +524,40 @@ static void bfq_pos_tree_add_move(struct bfq_data *bfqd,
  * rcu_read_lock()).
  */
 static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
-			  struct bfq_entity *entity, struct bfq_group *bfqg)
+			  struct bfq_group *bfqg)
 {
-	int busy, resume;
+	struct bfq_entity *entity = &bfqq->entity;
 
-	busy = bfq_bfqq_busy(bfqq);
-	resume = !RB_EMPTY_ROOT(&bfqq->sort_list);
-
-	BUG_ON(resume && !entity->on_st);
-	BUG_ON(busy && !resume && entity->on_st &&
+	BUG_ON(!bfq_bfqq_busy(bfqq) && !RB_EMPTY_ROOT(&bfqq->sort_list));
+	BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list) && !entity->on_st);
+	BUG_ON(bfq_bfqq_busy(bfqq) && RB_EMPTY_ROOT(&bfqq->sort_list)
+	       && entity->on_st &&
 	       bfqq != bfqd->in_service_queue);
+	BUG_ON(!bfq_bfqq_busy(bfqq) && bfqq == bfqd->in_service_queue);
+
+	/* If bfqq is empty, then bfq_bfqq_expire also invokes
+	 * bfq_del_bfqq_busy, thereby removing bfqq and its entity
+	 * from data structures related to current group. Otherwise we
+	 * need to remove bfqq explicitly with bfq_deactivate_bfqq, as
+	 * we do below.
+	 */
+	if (bfqq == bfqd->in_service_queue)
+		bfq_bfqq_expire(bfqd, bfqd->in_service_queue,
+				false, BFQ_BFQQ_PREEMPTED);
 
-	if (busy) {
-		BUG_ON(atomic_read(&bfqq->ref) < 2);
+	BUG_ON(entity->on_st && !bfq_bfqq_busy(bfqq)
+	    && &bfq_entity_service_tree(entity)->idle !=
+	       entity->tree);
 
-		if (!resume)
-			bfq_del_bfqq_busy(bfqd, bfqq, 0);
-		else
-			bfq_deactivate_bfqq(bfqd, bfqq, 0);
-	} else if (entity->on_st)
+	BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_bfqq_busy(bfqq));
+
+	if (bfq_bfqq_busy(bfqq))
+		bfq_deactivate_bfqq(bfqd, bfqq, false, false);
+	else if (entity->on_st) {
+		BUG_ON(&bfq_entity_service_tree(entity)->idle !=
+		       entity->tree);
 		bfq_put_idle_entity(bfq_entity_service_tree(entity), entity);
+	}
 	bfqg_put(bfqq_group(bfqq));
 
 	/*
@@ -583,14 +569,17 @@ static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
 	entity->sched_data = &bfqg->sched_data;
 	bfqg_get(bfqg);
 
-	if (busy) {
+	BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_bfqq_busy(bfqq));
+	if (bfq_bfqq_busy(bfqq)) {
 		bfq_pos_tree_add_move(bfqd, bfqq);
-		if (resume)
-			bfq_activate_bfqq(bfqd, bfqq);
+		bfq_activate_bfqq(bfqd, bfqq);
 	}
 
 	if (!bfqd->in_service_queue && !bfqd->rq_in_driver)
 		bfq_schedule_dispatch(bfqd);
+	BUG_ON(entity->on_st && !bfq_bfqq_busy(bfqq)
+	       && &bfq_entity_service_tree(entity)->idle !=
+	       entity->tree);
 }
 
 /**
@@ -617,7 +606,11 @@ static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd,
 
 	lockdep_assert_held(bfqd->queue->queue_lock);
 
-	bfqg = bfq_find_alloc_group(bfqd, blkcg);
+	bfqg = bfq_find_set_group(bfqd, blkcg);
+
+	if (unlikely(!bfqg))
+		bfqg = bfqd->root_group;
+
 	if (async_bfqq) {
 		entity = &async_bfqq->entity;
 
@@ -625,7 +618,8 @@ static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd,
 			bic_set_bfqq(bic, NULL, 0);
 			bfq_log_bfqq(bfqd, async_bfqq,
 				     "bic_change_group: %p %d",
-				     async_bfqq, atomic_read(&async_bfqq->ref));
+				     async_bfqq,
+				     async_bfqq->ref);
 			bfq_put_queue(async_bfqq);
 		}
 	}
@@ -633,7 +627,7 @@ static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd,
 	if (sync_bfqq) {
 		entity = &sync_bfqq->entity;
 		if (entity->sched_data != &bfqg->sched_data)
-			bfq_bfqq_move(bfqd, sync_bfqq, entity, bfqg);
+			bfq_bfqq_move(bfqd, sync_bfqq, bfqg);
 	}
 
 	return bfqg;
@@ -642,25 +636,23 @@ static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd,
 static void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio)
 {
 	struct bfq_data *bfqd = bic_to_bfqd(bic);
-	struct blkcg *blkcg;
 	struct bfq_group *bfqg = NULL;
-	uint64_t id;
+	uint64_t serial_nr;
 
 	rcu_read_lock();
-	blkcg = bio_blkcg(bio);
-	id = blkcg->css.serial_nr;
-	rcu_read_unlock();
+	serial_nr = bio_blkcg(bio)->css.serial_nr;
 
 	/*
 	 * Check whether blkcg has changed.  The condition may trigger
 	 * spuriously on a newly created cic but there's no harm.
 	 */
-	if (unlikely(!bfqd) || likely(bic->blkcg_id == id))
-		return;
+	if (unlikely(!bfqd) || likely(bic->blkcg_serial_nr == serial_nr))
+		goto out;
 
-	bfqg = __bfq_bic_change_cgroup(bfqd, bic, blkcg);
-	BUG_ON(!bfqg);
-	bic->blkcg_id = id;
+	bfqg = __bfq_bic_change_cgroup(bfqd, bic, bio_blkcg(bio));
+	bic->blkcg_serial_nr = serial_nr;
+out:
+	rcu_read_unlock();
 }
 
 /**
@@ -672,7 +664,7 @@ static void bfq_flush_idle_tree(struct bfq_service_tree *st)
 	struct bfq_entity *entity = st->first_idle;
 
 	for (; entity ; entity = st->first_idle)
-		__bfq_deactivate_entity(entity, 0);
+		__bfq_deactivate_entity(entity, false);
 }
 
 /**
@@ -686,7 +678,7 @@ static void bfq_reparent_leaf_entity(struct bfq_data *bfqd,
 	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
 
 	BUG_ON(!bfqq);
-	bfq_bfqq_move(bfqd, bfqq, entity, bfqd->root_group);
+	bfq_bfqq_move(bfqd, bfqq, bfqd->root_group);
 }
 
 /**
@@ -717,11 +709,12 @@ static void bfq_reparent_active_entities(struct bfq_data *bfqd,
 }
 
 /**
- * bfq_destroy_group - destroy @bfqg.
- * @bfqg: the group being destroyed.
+ * bfq_pd_offline - deactivate the entity associated with @pd,
+ *		    and reparent its children entities.
+ * @pd: descriptor of the policy going offline.
  *
- * Destroy @bfqg, making sure that it is not referenced from its parent.
- * blkio already grabs the queue_lock for us, so no need to use RCU-based magic
+ * blkio already grabs the queue_lock for us, so no need to use
+ * RCU-based magic
  */
 static void bfq_pd_offline(struct blkg_policy_data *pd)
 {
@@ -776,10 +769,16 @@ static void bfq_pd_offline(struct blkg_policy_data *pd)
 	BUG_ON(bfqg->sched_data.next_in_service);
 	BUG_ON(bfqg->sched_data.in_service_entity);
 
-	__bfq_deactivate_entity(entity, 0);
+	__bfq_deactivate_entity(entity, false);
 	bfq_put_async_queues(bfqd, bfqg);
 	BUG_ON(entity->tree);
 
+	/*
+	 * @blkg is going offline and will be ignored by
+	 * blkg_[rw]stat_recursive_sum().  Transfer stats to the parent so
+	 * that they don't get lost.  If IOs complete after this point, the
+	 * stats for them will be lost.  Oh well...
+	 */
 	bfqg_stats_xfer_dead(bfqg);
 }
 
@@ -789,46 +788,35 @@ static void bfq_end_wr_async(struct bfq_data *bfqd)
 
 	list_for_each_entry(blkg, &bfqd->queue->blkg_list, q_node) {
 		struct bfq_group *bfqg = blkg_to_bfqg(blkg);
+		BUG_ON(!bfqg);
 
 		bfq_end_wr_async_queues(bfqd, bfqg);
 	}
 	bfq_end_wr_async_queues(bfqd, bfqd->root_group);
 }
 
-static u64 bfqio_cgroup_weight_read(struct cgroup_subsys_state *css,
-				       struct cftype *cftype)
-{
-	struct blkcg *blkcg = css_to_blkcg(css);
-	struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg);
-	int ret = -EINVAL;
-
-	spin_lock_irq(&blkcg->lock);
-	ret = bfqgd->weight;
-	spin_unlock_irq(&blkcg->lock);
-
-	return ret;
-}
-
-static int bfqio_cgroup_weight_read_dfl(struct seq_file *sf, void *v)
+static int bfq_io_show_weight(struct seq_file *sf, void *v)
 {
 	struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
 	struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg);
+	unsigned int val = 0;
 
-	spin_lock_irq(&blkcg->lock);
-	seq_printf(sf, "%u\n", bfqgd->weight);
-	spin_unlock_irq(&blkcg->lock);
+	if (bfqgd)
+		val = bfqgd->weight;
+
+	seq_printf(sf, "%u\n", val);
 
 	return 0;
 }
 
-static int bfqio_cgroup_weight_write(struct cgroup_subsys_state *css,
-					struct cftype *cftype,
-					u64 val)
+static int bfq_io_set_weight_legacy(struct cgroup_subsys_state *css,
+				    struct cftype *cftype,
+				    u64 val)
 {
 	struct blkcg *blkcg = css_to_blkcg(css);
 	struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg);
 	struct blkcg_gq *blkg;
-	int ret = -EINVAL;
+	int ret = -ERANGE;
 
 	if (val < BFQ_MIN_WEIGHT || val > BFQ_MAX_WEIGHT)
 		return ret;
@@ -873,13 +861,18 @@ static int bfqio_cgroup_weight_write(struct cgroup_subsys_state *css,
 	return ret;
 }
 
-static ssize_t bfqio_cgroup_weight_write_dfl(struct kernfs_open_file *of,
-					     char *buf, size_t nbytes,
-					     loff_t off)
+static ssize_t bfq_io_set_weight(struct kernfs_open_file *of,
+				 char *buf, size_t nbytes,
+				 loff_t off)
 {
+	u64 weight;
 	/* First unsigned long found in the file is used */
-	return bfqio_cgroup_weight_write(of_css(of), NULL,
-					 simple_strtoull(strim(buf), NULL, 0));
+	int ret = kstrtoull(strim(buf), 0, &weight);
+
+	if (ret)
+		return ret;
+
+	return bfq_io_set_weight_legacy(of_css(of), NULL, weight);
 }
 
 static int bfqg_print_stat(struct seq_file *sf, void *v)
@@ -899,16 +892,17 @@ static int bfqg_print_rwstat(struct seq_file *sf, void *v)
 static u64 bfqg_prfill_stat_recursive(struct seq_file *sf,
 				      struct blkg_policy_data *pd, int off)
 {
-	u64 sum = bfqg_stat_pd_recursive_sum(pd, off);
-
+	u64 sum = blkg_stat_recursive_sum(pd_to_blkg(pd),
+					  &blkcg_policy_bfq, off);
 	return __blkg_prfill_u64(sf, pd, sum);
 }
 
 static u64 bfqg_prfill_rwstat_recursive(struct seq_file *sf,
 					struct blkg_policy_data *pd, int off)
 {
-	struct blkg_rwstat sum = bfqg_rwstat_pd_recursive_sum(pd, off);
-
+	struct blkg_rwstat sum = blkg_rwstat_recursive_sum(pd_to_blkg(pd),
+							   &blkcg_policy_bfq,
+							   off);
 	return __blkg_prfill_rwstat(sf, pd, &sum);
 }
 
@@ -928,6 +922,41 @@ static int bfqg_print_rwstat_recursive(struct seq_file *sf, void *v)
 	return 0;
 }
 
+static u64 bfqg_prfill_sectors(struct seq_file *sf, struct blkg_policy_data *pd,
+			       int off)
+{
+	u64 sum = blkg_rwstat_total(&pd->blkg->stat_bytes);
+
+	return __blkg_prfill_u64(sf, pd, sum >> 9);
+}
+
+static int bfqg_print_stat_sectors(struct seq_file *sf, void *v)
+{
+	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
+			  bfqg_prfill_sectors, &blkcg_policy_bfq, 0, false);
+	return 0;
+}
+
+static u64 bfqg_prfill_sectors_recursive(struct seq_file *sf,
+					 struct blkg_policy_data *pd, int off)
+{
+	struct blkg_rwstat tmp = blkg_rwstat_recursive_sum(pd->blkg, NULL,
+					offsetof(struct blkcg_gq, stat_bytes));
+	u64 sum = atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_READ]) +
+		atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_WRITE]);
+
+	return __blkg_prfill_u64(sf, pd, sum >> 9);
+}
+
+static int bfqg_print_stat_sectors_recursive(struct seq_file *sf, void *v)
+{
+	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
+			  bfqg_prfill_sectors_recursive, &blkcg_policy_bfq, 0,
+			  false);
+	return 0;
+}
+
+
 static u64 bfqg_prfill_avg_queue_size(struct seq_file *sf,
 				      struct blkg_policy_data *pd, int off)
 {
@@ -964,38 +993,15 @@ bfq_create_group_hierarchy(struct bfq_data *bfqd, int node)
 	return blkg_to_bfqg(bfqd->queue->root_blkg);
 }
 
-static struct blkcg_policy_data *bfq_cpd_alloc(gfp_t gfp)
-{
-	struct bfq_group_data *bgd;
-
-	bgd = kzalloc(sizeof(*bgd), GFP_KERNEL);
-	if (!bgd)
-		return NULL;
-	return &bgd->pd;
-}
-
-static void bfq_cpd_free(struct blkcg_policy_data *cpd)
-{
-	kfree(cpd_to_bfqgd(cpd));
-}
-
-static struct cftype bfqio_files_dfl[] = {
+static struct cftype bfq_blkcg_legacy_files[] = {
 	{
-		.name = "weight",
+		.name = "bfq.weight",
 		.flags = CFTYPE_NOT_ON_ROOT,
-		.seq_show = bfqio_cgroup_weight_read_dfl,
-		.write = bfqio_cgroup_weight_write_dfl,
+		.seq_show = bfq_io_show_weight,
+		.write_u64 = bfq_io_set_weight_legacy,
 	},
-	{} /* terminate */
-};
 
-static struct cftype bfqio_files[] = {
-	{
-		.name = "bfq.weight",
-		.read_u64 = bfqio_cgroup_weight_read,
-		.write_u64 = bfqio_cgroup_weight_write,
-	},
-	/* statistics, cover only the tasks in the bfqg */
+	/* statistics, covers only the tasks in the bfqg */
 	{
 		.name = "bfq.time",
 		.private = offsetof(struct bfq_group, stats.time),
@@ -1003,18 +1009,17 @@ static struct cftype bfqio_files[] = {
 	},
 	{
 		.name = "bfq.sectors",
-		.private = offsetof(struct bfq_group, stats.sectors),
-		.seq_show = bfqg_print_stat,
+		.seq_show = bfqg_print_stat_sectors,
 	},
 	{
 		.name = "bfq.io_service_bytes",
-		.private = offsetof(struct bfq_group, stats.service_bytes),
-		.seq_show = bfqg_print_rwstat,
+		.private = (unsigned long)&blkcg_policy_bfq,
+		.seq_show = blkg_print_stat_bytes,
 	},
 	{
 		.name = "bfq.io_serviced",
-		.private = offsetof(struct bfq_group, stats.serviced),
-		.seq_show = bfqg_print_rwstat,
+		.private = (unsigned long)&blkcg_policy_bfq,
+		.seq_show = blkg_print_stat_ios,
 	},
 	{
 		.name = "bfq.io_service_time",
@@ -1045,18 +1050,17 @@ static struct cftype bfqio_files[] = {
 	},
 	{
 		.name = "bfq.sectors_recursive",
-		.private = offsetof(struct bfq_group, stats.sectors),
-		.seq_show = bfqg_print_stat_recursive,
+		.seq_show = bfqg_print_stat_sectors_recursive,
 	},
 	{
 		.name = "bfq.io_service_bytes_recursive",
-		.private = offsetof(struct bfq_group, stats.service_bytes),
-		.seq_show = bfqg_print_rwstat_recursive,
+		.private = (unsigned long)&blkcg_policy_bfq,
+		.seq_show = blkg_print_stat_bytes_recursive,
 	},
 	{
 		.name = "bfq.io_serviced_recursive",
-		.private = offsetof(struct bfq_group, stats.serviced),
-		.seq_show = bfqg_print_rwstat_recursive,
+		.private = (unsigned long)&blkcg_policy_bfq,
+		.seq_show = blkg_print_stat_ios_recursive,
 	},
 	{
 		.name = "bfq.io_service_time_recursive",
@@ -1102,31 +1106,39 @@ static struct cftype bfqio_files[] = {
 		.private = offsetof(struct bfq_group, stats.dequeue),
 		.seq_show = bfqg_print_stat,
 	},
-	{
-		.name = "bfq.unaccounted_time",
-		.private = offsetof(struct bfq_group, stats.unaccounted_time),
-		.seq_show = bfqg_print_stat,
-	},
 	{ }	/* terminate */
 };
 
-static struct blkcg_policy blkcg_policy_bfq = {
-	.dfl_cftypes            = bfqio_files_dfl,
-	.legacy_cftypes		= bfqio_files,
-
-	.pd_alloc_fn		= bfq_pd_alloc,
-	.pd_init_fn		= bfq_pd_init,
-	.pd_offline_fn		= bfq_pd_offline,
-	.pd_free_fn		= bfq_pd_free,
-	.pd_reset_stats_fn	= bfq_pd_reset_stats,
-
-	.cpd_alloc_fn		= bfq_cpd_alloc,
-	.cpd_init_fn		= bfq_cpd_init,
-	.cpd_bind_fn		= bfq_cpd_init,
-	.cpd_free_fn		= bfq_cpd_free,
+static struct cftype bfq_blkg_files[] = {
+	{
+		.name = "bfq.weight",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.seq_show = bfq_io_show_weight,
+		.write = bfq_io_set_weight,
+	},
+	{} /* terminate */
 };
 
-#else
+#else /* CONFIG_BFQ_GROUP_IOSCHED */
+
+static inline void bfqg_stats_update_io_add(struct bfq_group *bfqg,
+			struct bfq_queue *bfqq, int op, int op_flags) { }
+static inline void
+bfqg_stats_update_io_remove(struct bfq_group *bfqg, int op, int op_flags) { }
+static inline void
+bfqg_stats_update_io_merged(struct bfq_group *bfqg, int op, int op_flags) { }
+static inline void bfqg_stats_update_completion(struct bfq_group *bfqg,
+			uint64_t start_time, uint64_t io_start_time, int op,
+			int op_flags) { }
+static inline void
+bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg,
+				     struct bfq_group *curr_bfqg) { }
+static inline void bfqg_stats_end_empty_time(struct bfqg_stats *stats) { }
+static inline void bfqg_stats_update_dequeue(struct bfq_group *bfqg) { }
+static inline void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) { }
+static inline void bfqg_stats_update_idle_time(struct bfq_group *bfqg) { }
+static inline void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) { }
+static inline void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) { }
 
 static void bfq_init_entity(struct bfq_entity *entity,
 			    struct bfq_group *bfqg)
@@ -1150,27 +1162,20 @@ bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio)
 	return bfqd->root_group;
 }
 
-static void bfq_bfqq_move(struct bfq_data *bfqd,
-			  struct bfq_queue *bfqq,
-			  struct bfq_entity *entity,
-			  struct bfq_group *bfqg)
-{
-}
-
 static void bfq_end_wr_async(struct bfq_data *bfqd)
 {
 	bfq_end_wr_async_queues(bfqd, bfqd->root_group);
 }
 
-static void bfq_disconnect_groups(struct bfq_data *bfqd)
+static struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd,
+					    struct blkcg *blkcg)
 {
-	bfq_put_async_queues(bfqd, bfqd->root_group);
+	return bfqd->root_group;
 }
 
-static struct bfq_group *bfq_find_alloc_group(struct bfq_data *bfqd,
-					      struct blkcg *blkcg)
+static struct bfq_group *bfqq_group(struct bfq_queue *bfqq)
 {
-	return bfqd->root_group;
+	return bfqq->bfqd->root_group;
 }
 
 static struct bfq_group *
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index cf3e9b1..2a2c130 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -1,5 +1,5 @@
 /*
- * Budget Fair Queueing (BFQ) disk scheduler.
+ * Budget Fair Queueing (BFQ) I/O scheduler.
  *
  * Based on ideas and code from CFQ:
  * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
@@ -7,25 +7,34 @@
  * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
  *		      Paolo Valente <paolo.valente@unimore.it>
  *
- * Copyright (C) 2010 Paolo Valente <paolo.valente@unimore.it>
+ * Copyright (C) 2015 Paolo Valente <paolo.valente@unimore.it>
+ *
+ * Copyright (C) 2016 Paolo Valente <paolo.valente@linaro.org>
  *
  * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ
  * file.
  *
- * BFQ is a proportional-share storage-I/O scheduling algorithm based on
- * the slice-by-slice service scheme of CFQ. But BFQ assigns budgets,
- * measured in number of sectors, to processes instead of time slices. The
- * device is not granted to the in-service process for a given time slice,
- * but until it has exhausted its assigned budget. This change from the time
- * to the service domain allows BFQ to distribute the device throughput
- * among processes as desired, without any distortion due to ZBR, workload
- * fluctuations or other factors. BFQ uses an ad hoc internal scheduler,
- * called B-WF2Q+, to schedule processes according to their budgets. More
- * precisely, BFQ schedules queues associated to processes. Thanks to the
- * accurate policy of B-WF2Q+, BFQ can afford to assign high budgets to
- * I/O-bound processes issuing sequential requests (to boost the
- * throughput), and yet guarantee a low latency to interactive and soft
- * real-time applications.
+ * BFQ is a proportional-share I/O scheduler, with some extra
+ * low-latency capabilities. BFQ also supports full hierarchical
+ * scheduling through cgroups. The next paragraphs introduce BFQ's
+ * inner workings. Details on BFQ benefits and usage can be
+ * found in Documentation/block/bfq-iosched.txt.
+ *
+ * BFQ is a proportional-share storage-I/O scheduling algorithm based
+ * on the slice-by-slice service scheme of CFQ. But BFQ assigns
+ * budgets, measured in number of sectors, to processes instead of
+ * time slices. The device is not granted to the in-service process
+ * for a given time slice, but until it has exhausted its assigned
+ * budget. This change from the time to the service domain enables BFQ
+ * to distribute the device throughput among processes as desired,
+ * without any distortion due to throughput fluctuations, or to device
+ * internal queueing. BFQ uses an ad hoc internal scheduler, called
+ * B-WF2Q+, to schedule processes according to their budgets. More
+ * precisely, BFQ schedules queues associated with processes. Thanks to
+ * the accurate policy of B-WF2Q+, BFQ can afford to assign high
+ * budgets to I/O-bound processes issuing sequential requests (to
+ * boost the throughput), and yet guarantee a low latency to
+ * interactive and soft real-time applications.
  *
  * BFQ is described in [1], where also a reference to the initial, more
  * theoretical paper on BFQ can be found. The interested reader can find
@@ -40,10 +49,10 @@
  * H-WF2Q+, while the augmented tree used to implement B-WF2Q+ with O(log N)
  * complexity derives from the one introduced with EEVDF in [3].
  *
- * [1] P. Valente and M. Andreolini, ``Improving Application Responsiveness
- *     with the BFQ Disk I/O Scheduler'',
- *     Proceedings of the 5th Annual International Systems and Storage
- *     Conference (SYSTOR '12), June 2012.
+ * [1] P. Valente, A. Avanzini, "Evolution of the BFQ Storage I/O
+ *   Scheduler", Proceedings of the First Workshop on Mobile System
+ *   Technologies (MST-2015), May 2015.
+ *   http://algogroup.unimore.it/people/paolo/disk_sched/mst-2015.pdf
  *
  * http://algogroup.unimo.it/people/paolo/disk_sched/bf1-v1-suite-results.pdf
  *
@@ -70,24 +79,23 @@
 #include "bfq.h"
 #include "blk.h"
 
-/* Expiration time of sync (0) and async (1) requests, in jiffies. */
-static const int bfq_fifo_expire[2] = { HZ / 4, HZ / 8 };
+/* Expiration time of sync (0) and async (1) requests, in ns. */
+static const u64 bfq_fifo_expire[2] = { NSEC_PER_SEC / 4, NSEC_PER_SEC / 8 };
 
 /* Maximum backwards seek, in KiB. */
-static const int bfq_back_max = 16 * 1024;
+static const int bfq_back_max = (16 * 1024);
 
 /* Penalty of a backwards seek, in number of sectors. */
 static const int bfq_back_penalty = 2;
 
-/* Idling period duration, in jiffies. */
-static int bfq_slice_idle = HZ / 125;
+/* Idling period duration, in ns. */
+static u32 bfq_slice_idle = (NSEC_PER_SEC / 125);
 
 /* Minimum number of assigned budgets for which stats are safe to compute. */
 static const int bfq_stats_min_budgets = 194;
 
 /* Default maximum budget values, in sectors and number of requests. */
-static const int bfq_default_max_budget = 16 * 1024;
-static const int bfq_max_budget_async_rq = 4;
+static const int bfq_default_max_budget = (16 * 1024);
 
 /*
  * Async to sync throughput distribution is controlled as follows:
@@ -97,23 +105,28 @@ static const int bfq_max_budget_async_rq = 4;
 static const int bfq_async_charge_factor = 10;
 
 /* Default timeout values, in jiffies, approximating CFQ defaults. */
-static const int bfq_timeout_sync = HZ / 8;
-static int bfq_timeout_async = HZ / 25;
+static const int bfq_timeout = (HZ / 8);
 
 struct kmem_cache *bfq_pool;
 
-/* Below this threshold (in ms), we consider thinktime immediate. */
-#define BFQ_MIN_TT		2
+/* Below this threshold (in ns), we consider thinktime immediate. */
+#define BFQ_MIN_TT		(2 * NSEC_PER_MSEC)
 
 /* hw_tag detection: parallel requests threshold and min samples needed. */
 #define BFQ_HW_QUEUE_THRESHOLD	4
 #define BFQ_HW_QUEUE_SAMPLES	32
 
-#define BFQQ_SEEK_THR	 (sector_t)(8 * 1024)
-#define BFQQ_SEEKY(bfqq) ((bfqq)->seek_mean > BFQQ_SEEK_THR)
+#define BFQQ_SEEK_THR		(sector_t)(8 * 100)
+#define BFQQ_SECT_THR_NONROT	(sector_t)(2 * 32)
+#define BFQQ_CLOSE_THR		(sector_t)(8 * 1024)
+#define BFQQ_SEEKY(bfqq)	(hweight32(bfqq->seek_history) > 32/8)
 
-/* Min samples used for peak rate estimation (for autotuning). */
-#define BFQ_PEAK_RATE_SAMPLES	32
+/* Min number of samples required to perform peak-rate update */
+#define BFQ_RATE_MIN_SAMPLES	32
+/* Min observation time interval required to perform a peak-rate update (ns) */
+#define BFQ_RATE_MIN_INTERVAL	(300*NSEC_PER_MSEC)
+/* Target observation time interval for a peak-rate update (ns) */
+#define BFQ_RATE_REF_INTERVAL	NSEC_PER_SEC
 
 /* Shift used for peak rate fixed precision calculations. */
 #define BFQ_RATE_SHIFT		16
@@ -141,16 +154,24 @@ struct kmem_cache *bfq_pool;
  * The device's speed class is dynamically (re)detected in
  * bfq_update_peak_rate() every time the estimated peak rate is updated.
  *
- * In the following definitions, R_slow[0]/R_fast[0] and T_slow[0]/T_fast[0]
- * are the reference values for a slow/fast rotational device, whereas
- * R_slow[1]/R_fast[1] and T_slow[1]/T_fast[1] are the reference values for
- * a slow/fast non-rotational device. Finally, device_speed_thresh are the
- * thresholds used to switch between speed classes.
+ * In the following definitions, R_slow[0]/R_fast[0] and
+ * T_slow[0]/T_fast[0] are the reference values for a slow/fast
+ * rotational device, whereas R_slow[1]/R_fast[1] and
+ * T_slow[1]/T_fast[1] are the reference values for a slow/fast
+ * non-rotational device. Finally, device_speed_thresh are the
+ * thresholds used to switch between speed classes. The reference
+ * rates are not the actual peak rates of the devices used as a
+ * reference, but slightly lower values. The reason for using these
+ * slightly lower values is that the peak-rate estimator tends to
+ * yield slightly lower values than the actual peak rate (it can yield
+ * the actual peak rate only if there is only one process doing I/O,
+ * and the process does sequential I/O).
+ *
  * Both the reference peak rates and the thresholds are measured in
  * sectors/usec, left-shifted by BFQ_RATE_SHIFT.
  */
-static int R_slow[2] = {1536, 10752};
-static int R_fast[2] = {17415, 34791};
+static int R_slow[2] = {1000, 10700};
+static int R_fast[2] = {14000, 33000};
 /*
  * To improve readability, a conversion function is used to initialize the
  * following arrays, which entails that they can be initialized only in a
@@ -183,10 +204,7 @@ static void bfq_schedule_dispatch(struct bfq_data *bfqd);
  */
 static int bfq_bio_sync(struct bio *bio)
 {
-	if (bio_data_dir(bio) == READ || (bio->bi_rw & REQ_SYNC))
-		return 1;
-
-	return 0;
+	return bio_data_dir(bio) == READ || (bio->bi_opf & REQ_SYNC);
 }
 
 /*
@@ -409,11 +427,7 @@ static bool bfq_differentiated_weights(struct bfq_data *bfqd)
  */
 static bool bfq_symmetric_scenario(struct bfq_data *bfqd)
 {
-	return
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
-		!bfqd->active_numerous_groups &&
-#endif
-		!bfq_differentiated_weights(bfqd);
+	return !bfq_differentiated_weights(bfqd);
 }
 
 /*
@@ -533,9 +547,19 @@ static struct request *bfq_find_next_rq(struct bfq_data *bfqd,
 static unsigned long bfq_serv_to_charge(struct request *rq,
 					struct bfq_queue *bfqq)
 {
-	return blk_rq_sectors(rq) *
-		(1 + ((!bfq_bfqq_sync(bfqq)) * (bfqq->wr_coeff == 1) *
-		bfq_async_charge_factor));
+	if (bfq_bfqq_sync(bfqq) || bfqq->wr_coeff > 1)
+		return blk_rq_sectors(rq);
+
+	/*
+	 * If there are no weight-raised queues, then amplify service
+	 * by just the async charge factor; otherwise amplify service
+	 * by twice the async charge factor, to further reduce latency
+	 * for weight-raised queues.
+	 */
+	if (bfqq->bfqd->wr_busy_queues == 0)
+		return blk_rq_sectors(rq) * bfq_async_charge_factor;
+
+	return blk_rq_sectors(rq) * 2 * bfq_async_charge_factor;
 }
 
 /**
@@ -576,7 +600,7 @@ static void bfq_updated_next_req(struct bfq_data *bfqd,
 		entity->budget = new_budget;
 		bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu",
 					 new_budget);
-		bfq_activate_bfqq(bfqd, bfqq);
+		bfq_requeue_bfqq(bfqd, bfqq);
 	}
 }
 
@@ -590,12 +614,23 @@ static unsigned int bfq_wr_duration(struct bfq_data *bfqd)
 	dur = bfqd->RT_prod;
 	do_div(dur, bfqd->peak_rate);
 
-	return dur;
-}
+	/*
+	 * Limit duration between 3 and 13 seconds. Tests show that
+	 * higher values than 13 seconds often yield the opposite of
+	 * the desired result, i.e., worsen responsiveness by letting
+	 * non-interactive and non-soft-real-time applications
+	 * preserve weight raising for too long a time interval.
+	 *
+	 * On the other hand, lower values than 3 seconds make it
+	 * difficult for most interactive tasks to complete their jobs
+	 * before weight-raising finishes.
+	 */
+	if (dur > msecs_to_jiffies(13000))
+		dur = msecs_to_jiffies(13000);
+	else if (dur < msecs_to_jiffies(3000))
+		dur = msecs_to_jiffies(3000);
 
-static unsigned int bfq_bfqq_cooperations(struct bfq_queue *bfqq)
-{
-	return bfqq->bic ? bfqq->bic->cooperations : 0;
+	return dur;
 }
 
 static void
@@ -605,31 +640,31 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_io_cq *bic)
 		bfq_mark_bfqq_idle_window(bfqq);
 	else
 		bfq_clear_bfqq_idle_window(bfqq);
+
 	if (bic->saved_IO_bound)
 		bfq_mark_bfqq_IO_bound(bfqq);
 	else
 		bfq_clear_bfqq_IO_bound(bfqq);
-	/* Assuming that the flag in_large_burst is already correctly set */
-	if (bic->wr_time_left && bfqq->bfqd->low_latency &&
-	    !bfq_bfqq_in_large_burst(bfqq) &&
-	    bic->cooperations < bfqq->bfqd->bfq_coop_thresh) {
-		/*
-		 * Start a weight raising period with the duration given by
-		 * the raising_time_left snapshot.
-		 */
-		if (bfq_bfqq_busy(bfqq))
-			bfqq->bfqd->wr_busy_queues++;
-		bfqq->wr_coeff = bfqq->bfqd->bfq_wr_coeff;
-		bfqq->wr_cur_max_time = bic->wr_time_left;
-		bfqq->last_wr_start_finish = jiffies;
-		bfqq->entity.prio_changed = 1;
+
+	bfqq->wr_coeff = bic->saved_wr_coeff;
+	bfqq->wr_start_at_switch_to_srt = bic->saved_wr_start_at_switch_to_srt;
+	BUG_ON(time_is_after_jiffies(bfqq->wr_start_at_switch_to_srt));
+	bfqq->last_wr_start_finish = bic->saved_last_wr_start_finish;
+	bfqq->wr_cur_max_time = bic->saved_wr_cur_max_time;
+	BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish));
+
+	if (bfqq->wr_coeff > 1 && (bfq_bfqq_in_large_burst(bfqq) ||
+	    time_is_before_jiffies(bfqq->last_wr_start_finish +
+				   bfqq->wr_cur_max_time))) {
+		bfq_log_bfqq(bfqq->bfqd, bfqq,
+			     "resume state: switching off wr (%lu + %lu < %lu)",
+			     bfqq->last_wr_start_finish, bfqq->wr_cur_max_time,
+			     jiffies);
+
+		bfqq->wr_coeff = 1;
 	}
-	/*
-	 * Clear wr_time_left to prevent bfq_bfqq_save_state() from
-	 * getting confused about the queue's need of a weight-raising
-	 * period.
-	 */
-	bic->wr_time_left = 0;
+	/* make sure weight will be updated, however we got here */
+	bfqq->entity.prio_changed = 1;
 }
 
 static int bfqq_process_refs(struct bfq_queue *bfqq)
@@ -639,7 +674,7 @@ static int bfqq_process_refs(struct bfq_queue *bfqq)
 	lockdep_assert_held(bfqq->bfqd->queue->queue_lock);
 
 	io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE];
-	process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st;
+	process_refs = bfqq->ref - io_refs - bfqq->entity.on_st;
 	BUG_ON(process_refs < 0);
 	return process_refs;
 }
@@ -654,6 +689,7 @@ static void bfq_reset_burst_list(struct bfq_data *bfqd, struct bfq_queue *bfqq)
 		hlist_del_init(&item->burst_list_node);
 	hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list);
 	bfqd->burst_size = 1;
+	bfqd->burst_parent_entity = bfqq->entity.parent;
 }
 
 /* Add bfqq to the list of queues in current burst (see bfq_handle_burst) */
@@ -662,6 +698,10 @@ static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq)
 	/* Increment burst size to take into account also bfqq */
 	bfqd->burst_size++;
 
+	bfq_log_bfqq(bfqd, bfqq, "add_to_burst %d", bfqd->burst_size);
+
+	BUG_ON(bfqd->burst_size > bfqd->bfq_large_burst_thresh);
+
 	if (bfqd->burst_size == bfqd->bfq_large_burst_thresh) {
 		struct bfq_queue *pos, *bfqq_item;
 		struct hlist_node *n;
@@ -671,15 +711,19 @@ static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq)
 		 * other to consider this burst as large.
 		 */
 		bfqd->large_burst = true;
+		bfq_log_bfqq(bfqd, bfqq, "add_to_burst: large burst started");
 
 		/*
 		 * We can now mark all queues in the burst list as
 		 * belonging to a large burst.
 		 */
 		hlist_for_each_entry(bfqq_item, &bfqd->burst_list,
-				     burst_list_node)
+				     burst_list_node) {
 			bfq_mark_bfqq_in_large_burst(bfqq_item);
+			bfq_log_bfqq(bfqd, bfqq_item, "marked in large burst");
+		}
 		bfq_mark_bfqq_in_large_burst(bfqq);
+		bfq_log_bfqq(bfqd, bfqq, "marked in large burst");
 
 		/*
 		 * From now on, and until the current burst finishes, any
@@ -691,67 +735,79 @@ static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq)
 		hlist_for_each_entry_safe(pos, n, &bfqd->burst_list,
 					  burst_list_node)
 			hlist_del_init(&pos->burst_list_node);
-	} else /* burst not yet large: add bfqq to the burst list */
+	} else /*
+		* Burst not yet large: add bfqq to the burst list. Do
+		* not increment the ref counter for bfqq, because bfqq
+		* is removed from the burst list before freeing bfqq
+		* in put_queue.
+		*/
 		hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list);
 }
 
 /*
- * If many queues happen to become active shortly after each other, then,
- * to help the processes associated to these queues get their job done as
- * soon as possible, it is usually better to not grant either weight-raising
- * or device idling to these queues. In this comment we describe, firstly,
- * the reasons why this fact holds, and, secondly, the next function, which
- * implements the main steps needed to properly mark these queues so that
- * they can then be treated in a different way.
+ * If many queues belonging to the same group happen to be created
+ * shortly after each other, then the processes associated with these
+ * queues typically have a common goal. In particular, bursts of queue
+ * creations are usually caused by services or applications that spawn
+ * many parallel threads/processes. Examples are systemd during boot,
+ * or git grep. To help these processes get their job done as soon as
+ * possible, it is usually better to not grant either weight-raising
+ * or device idling to their queues.
  *
- * As for the terminology, we say that a queue becomes active, i.e.,
- * switches from idle to backlogged, either when it is created (as a
- * consequence of the arrival of an I/O request), or, if already existing,
- * when a new request for the queue arrives while the queue is idle.
- * Bursts of activations, i.e., activations of different queues occurring
- * shortly after each other, are typically caused by services or applications
- * that spawn or reactivate many parallel threads/processes. Examples are
- * systemd during boot or git grep.
+ * In this comment we describe, firstly, the reasons why this fact
+ * holds, and, secondly, the next function, which implements the main
+ * steps needed to properly mark these queues so that they can then be
+ * treated in a different way.
  *
- * These services or applications benefit mostly from a high throughput:
- * the quicker the requests of the activated queues are cumulatively served,
- * the sooner the target job of these queues gets completed. As a consequence,
- * weight-raising any of these queues, which also implies idling the device
- * for it, is almost always counterproductive: in most cases it just lowers
- * throughput.
+ * The above services or applications benefit mostly from a high
+ * throughput: the quicker the requests of the activated queues are
+ * cumulatively served, the sooner the target job of these queues gets
+ * completed. As a consequence, weight-raising any of these queues,
+ * which also implies idling the device for it, is almost always
+ * counterproductive. In most cases it just lowers throughput.
  *
- * On the other hand, a burst of activations may be also caused by the start
- * of an application that does not consist in a lot of parallel I/O-bound
- * threads. In fact, with a complex application, the burst may be just a
- * consequence of the fact that several processes need to be executed to
- * start-up the application. To start an application as quickly as possible,
- * the best thing to do is to privilege the I/O related to the application
- * with respect to all other I/O. Therefore, the best strategy to start as
- * quickly as possible an application that causes a burst of activations is
- * to weight-raise all the queues activated during the burst. This is the
+ * On the other hand, a burst of queue creations may be caused also by
+ * the start of an application that does not consist of a lot of
+ * parallel I/O-bound threads. In fact, with a complex application,
+ * several short processes may need to be executed to start up the
+ * application. In this respect, to start an application as quickly as
+ * possible, the best thing to do is in any case to privilege the I/O
+ * related to the application with respect to all other
+ * I/O. Therefore, the best strategy to start as quickly as possible
+ * an application that causes a burst of queue creations is to
+ * weight-raise all the queues created during the burst. This is the
  * exact opposite of the best strategy for the other type of bursts.
  *
- * In the end, to take the best action for each of the two cases, the two
- * types of bursts need to be distinguished. Fortunately, this seems
- * relatively easy to do, by looking at the sizes of the bursts. In
- * particular, we found a threshold such that bursts with a larger size
- * than that threshold are apparently caused only by services or commands
- * such as systemd or git grep. For brevity, hereafter we call just 'large'
- * these bursts. BFQ *does not* weight-raise queues whose activations occur
- * in a large burst. In addition, for each of these queues BFQ performs or
- * does not perform idling depending on which choice boosts the throughput
- * most. The exact choice depends on the device and request pattern at
+ * In the end, to take the best action for each of the two cases, the
+ * two types of bursts need to be distinguished. Fortunately, this
+ * seems relatively easy, by looking at the sizes of the bursts. In
+ * particular, we found a threshold such that only bursts with a
+ * larger size than that threshold are apparently caused by
+ * services or commands such as systemd or git grep. For brevity,
+ * hereafter we simply call these bursts 'large'. BFQ *does not*
+ * weight-raise queues whose creation occurs in a large burst. In
+ * addition, for each of these queues BFQ performs or does not perform
+ * idling depending on which choice boosts the throughput more. The
+ * exact choice depends on the device and request pattern at
  * hand.
  *
- * Turning back to the next function, it implements all the steps needed
- * to detect the occurrence of a large burst and to properly mark all the
- * queues belonging to it (so that they can then be treated in a different
- * way). This goal is achieved by maintaining a special "burst list" that
- * holds, temporarily, the queues that belong to the burst in progress. The
- * list is then used to mark these queues as belonging to a large burst if
- * the burst does become large. The main steps are the following.
+ * Unfortunately, false positives may occur while an interactive task
+ * is starting (e.g., an application is being started). The
+ * consequence is that the queues associated with the task do not
+ * enjoy weight raising as expected. Fortunately these false positives
+ * are very rare. They typically occur if some service happens to
+ * start doing I/O exactly when the interactive task starts.
+ *
+ * Turning back to the next function, it implements all the steps
+ * needed to detect the occurrence of a large burst and to properly
+ * mark all the queues belonging to it (so that they can then be
+ * treated in a different way). This goal is achieved by maintaining a
+ * "burst list" that holds, temporarily, the queues that belong to the
+ * burst in progress. The list is then used to mark these queues as
+ * belonging to a large burst if the burst does become large. The main
+ * steps are the following.
  *
- * . when the very first queue is activated, the queue is inserted into the
+ * . when the very first queue is created, the queue is inserted into the
  *   list (as it could be the first queue in a possible burst)
  *
  * . if the current burst has not yet become large, and a queue Q that does
@@ -772,13 +828,13 @@ static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq)
  *
  *     . the device enters a large-burst mode
  *
- * . if a queue Q that does not belong to the burst is activated while
+ * . if a queue Q that does not belong to the burst is created while
  *   the device is in large-burst mode and shortly after the last time
  *   at which a queue either entered the burst list or was marked as
  *   belonging to the current large burst, then Q is immediately marked
  *   as belonging to a large burst.
  *
- * . if a queue Q that does not belong to the burst is activated a while
+ * . if a queue Q that does not belong to the burst is created a while
  *   later, i.e., not shortly after, than the last time at which a queue
  *   either entered the burst list or was marked as belonging to the
  *   current large burst, then the current burst is deemed as finished and:
@@ -791,52 +847,44 @@ static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq)
  *          in a possible new burst (then the burst list contains just Q
  *          after this step).
  */
-static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq,
-			     bool idle_for_long_time)
+static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq)
 {
 	/*
-	 * If bfqq happened to be activated in a burst, but has been idle
-	 * for at least as long as an interactive queue, then we assume
-	 * that, in the overall I/O initiated in the burst, the I/O
-	 * associated to bfqq is finished. So bfqq does not need to be
-	 * treated as a queue belonging to a burst anymore. Accordingly,
-	 * we reset bfqq's in_large_burst flag if set, and remove bfqq
-	 * from the burst list if it's there. We do not decrement instead
-	 * burst_size, because the fact that bfqq does not need to belong
-	 * to the burst list any more does not invalidate the fact that
-	 * bfqq may have been activated during the current burst.
-	 */
-	if (idle_for_long_time) {
-		hlist_del_init(&bfqq->burst_list_node);
-		bfq_clear_bfqq_in_large_burst(bfqq);
-	}
-
-	/*
 	 * If bfqq is already in the burst list or is part of a large
-	 * burst, then there is nothing else to do.
+	 * burst, or finally has just been split, then there is
+	 * nothing else to do.
 	 */
 	if (!hlist_unhashed(&bfqq->burst_list_node) ||
-	    bfq_bfqq_in_large_burst(bfqq))
+	    bfq_bfqq_in_large_burst(bfqq) ||
+	    time_is_after_eq_jiffies(bfqq->split_time +
+				     msecs_to_jiffies(10)))
 		return;
 
 	/*
-	 * If bfqq's activation happens late enough, then the current
-	 * burst is finished, and related data structures must be reset.
+	 * If bfqq's creation happens late enough, or bfqq belongs to
+	 * a different group than the burst group, then the current
+	 * burst is finished, and related data structures must be
+	 * reset.
 	 *
-	 * In this respect, consider the special case where bfqq is the very
-	 * first queue being activated. In this case, last_ins_in_burst is
-	 * not yet significant when we get here. But it is easy to verify
-	 * that, whether or not the following condition is true, bfqq will
-	 * end up being inserted into the burst list. In particular the
-	 * list will happen to contain only bfqq. And this is exactly what
-	 * has to happen, as bfqq may be the first queue in a possible
+	 * In this respect, consider the special case where bfqq is
+	 * the very first queue created after BFQ is selected for this
+	 * device. In this case, last_ins_in_burst and
+	 * burst_parent_entity are not yet significant when we get
+	 * here. But it is easy to verify that, whether or not the
+	 * following condition is true, bfqq will end up being
+	 * inserted into the burst list. In particular the list will
+	 * happen to contain only bfqq. And this is exactly what has
+	 * to happen, as bfqq may be the first queue of the first
 	 * burst.
 	 */
 	if (time_is_before_jiffies(bfqd->last_ins_in_burst +
-	    bfqd->bfq_burst_interval)) {
+	    bfqd->bfq_burst_interval) ||
+	    bfqq->entity.parent != bfqd->burst_parent_entity) {
 		bfqd->large_burst = false;
 		bfq_reset_burst_list(bfqd, bfqq);
-		return;
+		bfq_log_bfqq(bfqd, bfqq,
+			"handle_burst: late activation or different group");
+		goto end;
 	}
 
 	/*
@@ -845,8 +893,9 @@ static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq,
 	 * bfqq as belonging to this large burst immediately.
 	 */
 	if (bfqd->large_burst) {
+		bfq_log_bfqq(bfqd, bfqq, "handle_burst: marked in burst");
 		bfq_mark_bfqq_in_large_burst(bfqq);
-		return;
+		goto end;
 	}
 
 	/*
@@ -855,25 +904,491 @@ static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq,
 	 * queue. Then we add bfqq to the burst.
 	 */
 	bfq_add_to_burst(bfqd, bfqq);
+end:
+	/*
+	 * At this point, bfqq either has been added to the current
+	 * burst or has caused the current burst to terminate and a
+	 * possible new burst to start. In particular, in the second
+	 * case, bfqq has become the first queue in the possible new
+	 * burst.  In both cases last_ins_in_burst needs to be moved
+	 * forward.
+	 */
+	bfqd->last_ins_in_burst = jiffies;
+
+}
+
+static int bfq_bfqq_budget_left(struct bfq_queue *bfqq)
+{
+	struct bfq_entity *entity = &bfqq->entity;
+
+	return entity->budget - entity->service;
+}
+
+/*
+ * If enough samples have been computed, return the current max budget
+ * stored in bfqd, which is dynamically updated according to the
+ * estimated disk peak rate; otherwise return the default max budget
+ */
+static int bfq_max_budget(struct bfq_data *bfqd)
+{
+	if (bfqd->budgets_assigned < bfq_stats_min_budgets)
+		return bfq_default_max_budget;
+	else
+		return bfqd->bfq_max_budget;
+}
+
+/*
+ * Return min budget, which is a fraction of the current or default
+ * max budget (trying with 1/32)
+ */
+static int bfq_min_budget(struct bfq_data *bfqd)
+{
+	if (bfqd->budgets_assigned < bfq_stats_min_budgets)
+		return bfq_default_max_budget / 32;
+	else
+		return bfqd->bfq_max_budget / 32;
+}
+
+static void bfq_bfqq_expire(struct bfq_data *bfqd,
+			    struct bfq_queue *bfqq,
+			    bool compensate,
+			    enum bfqq_expiration reason);
+
+/*
+ * The next function, invoked after the input queue bfqq switches from
+ * idle to busy, updates the budget of bfqq. The function also tells
+ * whether the in-service queue should be expired, by returning
+ * true. The purpose of expiring the in-service queue is to give bfqq
+ * the chance to possibly preempt the in-service queue, and the reason
+ * for preempting the in-service queue is to achieve one of the two
+ * goals below.
+ *
+ * 1. Guarantee to bfqq its reserved bandwidth even if bfqq has
+ * expired because it has remained idle. In particular, bfqq may have
+ * expired for one of the following two reasons:
+ *
+ * - BFQ_BFQQ_NO_MORE_REQUEST bfqq did not enjoy any device idling and
+ *   did not manage to issue a new request before its last request
+ *   was served;
+ *
+ * - BFQ_BFQQ_TOO_IDLE bfqq did enjoy device idling, but did not issue
+ *   a new request before the expiration of the idling-time.
+ *
+ * Even if bfqq has expired for one of the above reasons, the process
+ * associated with the queue may nevertheless be issuing requests greedily,
+ * and thus be sensitive to the bandwidth it receives (bfqq may have
+ * remained idle for other reasons: CPU high load, bfqq not enjoying
+ * idling, I/O throttling somewhere in the path from the process to
+ * the I/O scheduler, ...). But if, after every expiration for one of
+ * the above two reasons, bfqq has to wait for the service of at least
+ * one full budget of another queue before being served again, then
+ * bfqq is likely to get a much lower bandwidth or resource time than
+ * its reserved ones. To address this issue, two countermeasures need
+ * to be taken.
+ *
+ * First, the budget and the timestamps of bfqq need to be updated in
+ * a special way on bfqq reactivation: they need to be updated as if
+ * bfqq did not remain idle and did not expire. In fact, if they are
+ * computed as if bfqq expired and remained idle until reactivation,
+ * then the process associated with bfqq is treated as if, instead of
+ * being greedy, it stopped issuing requests when bfqq remained idle,
+ * and restarts issuing requests only on this reactivation. In other
+ * words, the scheduler does not help the process recover the "service
+ * hole" between bfqq expiration and reactivation. As a consequence,
+ * the process receives a lower bandwidth than its reserved one. In
+ * contrast, to recover this hole, the budget must be updated as if
+ * bfqq was not expired at all before this reactivation, i.e., it must
+ * be set to the value of the remaining budget when bfqq was
+ * expired. Along the same line, timestamps need to be assigned the
+ * value they had the last time bfqq was selected for service, i.e.,
+ * before last expiration. Thus timestamps need to be back-shifted
+ * with respect to their normal computation (see [1] for more details
+ * on this tricky aspect).
+ *
+ * Secondly, to allow the process to recover the hole, the in-service
+ * queue must be expired too, to give bfqq the chance to preempt it
+ * immediately. In fact, if bfqq has to wait for a full budget of the
+ * in-service queue to be completed, then it may become impossible to
+ * let the process recover the hole, even if the back-shifted
+ * timestamps of bfqq are lower than those of the in-service queue. If
+ * this happens for most or all of the holes, then the process may not
+ * receive its reserved bandwidth. In this respect, it is worth noting
+ * that, since the service of outstanding requests is unpreemptible, a
+ * small fraction of the holes may still be unrecoverable, thereby
+ * causing a small loss of bandwidth.
+ *
+ * The last important point is detecting whether bfqq does need this
+ * bandwidth recovery. In this respect, the next function deems the
+ * process associated with bfqq greedy, and thus allows it to recover
+ * the hole, if: 1) the process is waiting for the arrival of a new
+ * request (which implies that bfqq expired for one of the above two
+ * reasons), and 2) such a request has arrived soon. The first
+ * condition is controlled through the flag non_blocking_wait_rq,
+ * while the second through the flag arrived_in_time. If both
+ * conditions hold, then the function computes the budget in the
+ * above-described special way, and signals that the in-service queue
+ * should be expired. Timestamp back-shifting is done later in
+ * __bfq_activate_entity.
+ *
+ * 2. Reduce latency. Even if timestamps are not backshifted to let
+ * the process associated with bfqq recover a service hole, bfqq may
+ * however happen to have, after being (re)activated, a lower finish
+ * timestamp than the in-service queue.  That is, the next budget of
+ * bfqq may have to be completed before the one of the in-service
+ * queue. If this is the case, then preempting the in-service queue
+ * allows this goal to be achieved, apart from the unpreemptible,
+ * outstanding requests mentioned above.
+ *
+ * Unfortunately, regardless of which of the above two goals one wants
+ * to achieve, service trees need first to be updated to know whether
+ * the in-service queue must be preempted. To have service trees
+ * correctly updated, the in-service queue must be expired and
+ * rescheduled, and bfqq must be scheduled too. This is one of the
+ * most costly operations (in future versions, the scheduling
+ * mechanism may be re-designed in such a way as to make it possible to
+ * know whether preemption is needed without needing to update service
+ * trees). In addition, queue preemptions almost always cause random
+ * I/O, and thus loss of throughput. Because of these facts, the next
+ * function adopts the following simple scheme to avoid both costly
+ * operations and too frequent preemptions: it requests the expiration
+ * of the in-service queue (unconditionally) only for queues that need
+ * to recover a hole, or that either are weight-raised or deserve to
+ * be weight-raised.
+ */
+static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd,
+						struct bfq_queue *bfqq,
+						bool arrived_in_time,
+						bool wr_or_deserves_wr)
+{
+	struct bfq_entity *entity = &bfqq->entity;
+
+	if (bfq_bfqq_non_blocking_wait_rq(bfqq) && arrived_in_time) {
+		/*
+		 * We do not clear the flag non_blocking_wait_rq here, as
+		 * the latter is used in bfq_activate_bfqq to signal
+		 * that timestamps need to be back-shifted (and is
+		 * cleared right after).
+		 */
+
+		/*
+		 * The next assignment relies on the fact that neither
+		 * entity->service nor entity->budget is updated
+		 * on expiration if bfqq is empty (see
+		 * __bfq_bfqq_recalc_budget). Thus both quantities
+		 * remain unchanged after such an expiration, and the
+		 * following statement therefore assigns to
+		 * entity->budget the remaining budget on such an
+		 * expiration. For clarity, entity->service is not
+		 * updated on expiration in any case, and, in normal
+		 * operation, is reset only when bfqq is selected for
+		 * service (see bfq_get_next_queue).
+		 */
+		BUG_ON(bfqq->max_budget < 0);
+		entity->budget = min_t(unsigned long,
+				       bfq_bfqq_budget_left(bfqq),
+				       bfqq->max_budget);
+
+		BUG_ON(entity->budget < 0);
+		return true;
+	}
+
+	BUG_ON(bfqq->max_budget < 0);
+	entity->budget = max_t(unsigned long, bfqq->max_budget,
+			       bfq_serv_to_charge(bfqq->next_rq, bfqq));
+	BUG_ON(entity->budget < 0);
+
+	bfq_clear_bfqq_non_blocking_wait_rq(bfqq);
+	return wr_or_deserves_wr;
+}
+
+/* Start, refresh, switch or end bfqq's weight-raising on request arrival. */
+static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd,
+					     struct bfq_queue *bfqq,
+					     unsigned int old_wr_coeff,
+					     bool wr_or_deserves_wr,
+					     bool interactive,
+					     bool in_burst,
+					     bool soft_rt)
+{
+	if (old_wr_coeff == 1 && wr_or_deserves_wr) {
+		/* start a weight-raising period */
+		if (interactive) {
+			bfqq->wr_coeff = bfqd->bfq_wr_coeff;
+			bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
+		} else {
+			bfqq->wr_start_at_switch_to_srt = jiffies;
+			bfqq->wr_coeff = bfqd->bfq_wr_coeff *
+				BFQ_SOFTRT_WEIGHT_FACTOR;
+			bfqq->wr_cur_max_time = bfqd->bfq_wr_rt_max_time;
+		}
+		/*
+		 * If needed, further reduce budget to make sure it is
+		 * close to bfqq's backlog, so as to reduce the
+		 * scheduling-error component due to a too large
+		 * budget. Do not care about throughput consequences,
+		 * but only about latency. Finally, do not assign a
+		 * too small budget either, to avoid increasing
+		 * latency by causing too frequent expirations.
+		 */
+		bfqq->entity.budget = min_t(unsigned long,
+					    bfqq->entity.budget,
+					    2 * bfq_min_budget(bfqd));
+
+		bfq_log_bfqq(bfqd, bfqq,
+			     "wrais starting at %lu, rais_max_time %u",
+			     jiffies,
+			     jiffies_to_msecs(bfqq->wr_cur_max_time));
+	} else if (old_wr_coeff > 1) {
+		if (interactive) { /* update wr coeff and duration */
+			bfqq->wr_coeff = bfqd->bfq_wr_coeff;
+			bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
+		} else if (in_burst) {
+			bfqq->wr_coeff = 1;
+			bfq_log_bfqq(bfqd, bfqq,
+				     "wrais ending at %lu, rais_max_time %u",
+				     jiffies,
+				     jiffies_to_msecs(bfqq->wr_cur_max_time));
+		} else if (soft_rt) {
+			/*
+			 * The application is now or still meeting the
+			 * requirements for being deemed soft rt.  We
+			 * can then correctly and safely (re)charge
+			 * the weight-raising duration for the
+			 * application with the weight-raising
+			 * duration for soft rt applications.
+			 *
+			 * In particular, doing this recharge now, i.e.,
+			 * before the weight-raising period for the
+			 * application finishes, reduces the probability
+			 * of the following negative scenario:
+			 * 1) the weight of a soft rt application is
+			 *    raised at startup (as for any newly
+			 *    created application),
+			 * 2) since the application is not interactive,
+			 *    at a certain time weight-raising is
+			 *    stopped for the application,
+			 * 3) at that time the application happens to
+			 *    still have pending requests, and hence
+			 *    is destined to not have a chance to be
+			 *    deemed soft rt before these requests are
+			 *    completed (see the comments to the
+			 *    function bfq_bfqq_softrt_next_start()
+			 *    for details on soft rt detection),
+			 * 4) these pending requests experience a high
+			 *    latency because the application is not
+			 *    weight-raised while they are pending.
+			 */
+			if (bfqq->wr_cur_max_time !=
+				bfqd->bfq_wr_rt_max_time) {
+				bfqq->wr_start_at_switch_to_srt =
+					bfqq->last_wr_start_finish;
+				BUG_ON(time_is_after_jiffies(
+					bfqq->last_wr_start_finish));
+
+				bfqq->wr_cur_max_time =
+					bfqd->bfq_wr_rt_max_time;
+				bfqq->wr_coeff = bfqd->bfq_wr_coeff *
+					BFQ_SOFTRT_WEIGHT_FACTOR;
+				bfq_log_bfqq(bfqd, bfqq,
+					     "switching to soft_rt wr");
+			} else
+				bfq_log_bfqq(bfqd, bfqq,
+					"moving forward soft_rt wr duration");
+			bfqq->last_wr_start_finish = jiffies;
+		}
+	}
+}
+
+static bool bfq_bfqq_idle_for_long_time(struct bfq_data *bfqd,
+					struct bfq_queue *bfqq)
+{
+	unsigned long long_idle_from =
+		bfqq->budget_timeout + bfqd->bfq_wr_min_idle_time;
+
+	return !bfqq->dispatched && time_is_before_jiffies(long_idle_from);
+}
+
+/*
+ * Handle the arrival of rq for a bfqq that is not busy: update budget and
+ * weight-raising state, make bfqq busy, maybe expire the in-service queue.
+ */
+static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd,
+					     struct bfq_queue *bfqq,
+					     unsigned int old_wr_coeff,
+					     struct request *rq,
+					     bool *interactive)
+{
+	bool soft_rt, in_burst, wr_or_deserves_wr,
+		bfqq_wants_to_preempt,
+		idle_for_long_time = bfq_bfqq_idle_for_long_time(bfqd, bfqq),
+		/*
+		 * See the comments on
+		 * bfq_bfqq_update_budg_for_activation for
+		 * details on the usage of the next variable.
+		 */
+		arrived_in_time = ktime_get_ns() <=
+			RQ_BIC(rq)->ttime.last_end_request +
+			bfqd->bfq_slice_idle * 3;
+
+	bfq_log_bfqq(bfqd, bfqq,
+		     "bfq_add_request non-busy: "
+		     "jiffies %lu, in_time %d, idle_long %d busyw %d "
+		     "wr_coeff %u",
+		     jiffies, arrived_in_time,
+		     idle_for_long_time,
+		     bfq_bfqq_non_blocking_wait_rq(bfqq),
+		     old_wr_coeff);
+
+	BUG_ON(bfqq->entity.budget < bfqq->entity.service);
+
+	BUG_ON(bfqq == bfqd->in_service_queue);
+	bfqg_stats_update_io_add(bfqq_group(RQ_BFQQ(rq)), bfqq,
+				 req_op(rq), rq->cmd_flags);
+
+	/*
+	 * bfqq deserves to be weight-raised if:
+	 * - it is sync,
+	 * - it does not belong to a large burst,
+	 * - it has been idle for enough time or is soft real-time,
+	 * - it is linked to a bfq_io_cq (it is not shared in any sense)
+	 */
+	in_burst = bfq_bfqq_in_large_burst(bfqq);
+	soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 &&
+		!in_burst &&
+		time_is_before_jiffies(bfqq->soft_rt_next_start);
+	*interactive = !in_burst && idle_for_long_time;
+	wr_or_deserves_wr = bfqd->low_latency &&
+		(bfqq->wr_coeff > 1 ||
+		 (bfq_bfqq_sync(bfqq) &&
+		  bfqq->bic && (*interactive || soft_rt)));
+
+	bfq_log_bfqq(bfqd, bfqq,
+		     "bfq_add_request: "
+		     "in_burst %d, "
+		     "soft_rt %d (next %lu), inter %d, bic %p",
+		     bfq_bfqq_in_large_burst(bfqq), soft_rt,
+		     bfqq->soft_rt_next_start,
+		     *interactive,
+		     bfqq->bic);
+
+	/*
+	 * Using the last flag, update budget and check whether bfqq
+	 * may want to preempt the in-service queue.
+	 */
+	bfqq_wants_to_preempt =
+		bfq_bfqq_update_budg_for_activation(bfqd, bfqq,
+						    arrived_in_time,
+						    wr_or_deserves_wr);
+
+	/*
+	 * If bfqq happened to be activated in a burst, but has been
+	 * idle for much more than an interactive queue, then we
+	 * assume that, in the overall I/O initiated in the burst, the
+	 * I/O associated with bfqq is finished. So bfqq does not need
+	 * to be treated as a queue belonging to a burst
+	 * anymore. Accordingly, we reset bfqq's in_large_burst flag
+	 * if set, and remove bfqq from the burst list if it's
+	 * there. We do not decrement burst_size, because the fact
+	 * that bfqq does not need to belong to the burst list any
+	 * more does not invalidate the fact that bfqq was created in
+	 * a burst.
+	 */
+	if (likely(!bfq_bfqq_just_created(bfqq)) &&
+	    idle_for_long_time &&
+	    time_is_before_jiffies(bfqq->budget_timeout +
+				   msecs_to_jiffies(10000))) {
+		hlist_del_init(&bfqq->burst_list_node);
+		bfq_clear_bfqq_in_large_burst(bfqq);
+	}
+
+	bfq_clear_bfqq_just_created(bfqq);
+
+	if (!bfq_bfqq_IO_bound(bfqq)) {
+		if (arrived_in_time) {
+			bfqq->requests_within_timer++;
+			if (bfqq->requests_within_timer >=
+			    bfqd->bfq_requests_within_timer)
+				bfq_mark_bfqq_IO_bound(bfqq);
+		} else
+			bfqq->requests_within_timer = 0;
+		bfq_log_bfqq(bfqd, bfqq, "requests in time %d",
+			     bfqq->requests_within_timer);
+	}
+
+	if (bfqd->low_latency) {
+		if (unlikely(time_is_after_jiffies(bfqq->split_time)))
+			/* wraparound */
+			bfqq->split_time =
+				jiffies - bfqd->bfq_wr_min_idle_time - 1;
+
+		if (time_is_before_jiffies(bfqq->split_time +
+					   bfqd->bfq_wr_min_idle_time)) {
+			bfq_update_bfqq_wr_on_rq_arrival(bfqd, bfqq,
+							 old_wr_coeff,
+							 wr_or_deserves_wr,
+							 *interactive,
+							 in_burst,
+							 soft_rt);
+
+			if (old_wr_coeff != bfqq->wr_coeff)
+				bfqq->entity.prio_changed = 1;
+		}
+	}
+
+	bfqq->last_idle_bklogged = jiffies;
+	bfqq->service_from_backlogged = 0;
+	bfq_clear_bfqq_softrt_update(bfqq);
+
+	bfq_add_bfqq_busy(bfqd, bfqq);
+
+	/*
+	 * Expire in-service queue only if preemption may be needed
+	 * for guarantees. In this respect, the function
+	 * next_queue_may_preempt just checks a simple, necessary
+	 * condition, and not a sufficient condition based on
+	 * timestamps. In fact, for the latter condition to be
+	 * evaluated, timestamps would need first to be updated, and
+	 * this operation is quite costly (see the comments on the
+	 * function bfq_bfqq_update_budg_for_activation).
+	 */
+	if (bfqd->in_service_queue && bfqq_wants_to_preempt &&
+	    bfqd->in_service_queue->wr_coeff < bfqq->wr_coeff &&
+	    next_queue_may_preempt(bfqd)) {
+		struct bfq_queue *in_serv = bfqd->in_service_queue;
+		BUG_ON(in_serv == bfqq);
+
+		bfq_bfqq_expire(bfqd, bfqd->in_service_queue,
+				false, BFQ_BFQQ_PREEMPTED);
+		BUG_ON(in_serv->entity.budget < 0);
+	}
 }
 
 static void bfq_add_request(struct request *rq)
 {
 	struct bfq_queue *bfqq = RQ_BFQQ(rq);
-	struct bfq_entity *entity = &bfqq->entity;
 	struct bfq_data *bfqd = bfqq->bfqd;
 	struct request *next_rq, *prev;
-	unsigned long old_wr_coeff = bfqq->wr_coeff;
+	unsigned int old_wr_coeff = bfqq->wr_coeff;
 	bool interactive = false;
 
-	bfq_log_bfqq(bfqd, bfqq, "add_request %d", rq_is_sync(rq));
+	bfq_log_bfqq(bfqd, bfqq, "add_request: size %u %s",
+		     blk_rq_sectors(rq), rq_is_sync(rq) ? "S" : "A");
+
+	if (bfqq->wr_coeff > 1) /* queue is being weight-raised */
+		bfq_log_bfqq(bfqd, bfqq,
+			"raising period dur %u/%u msec, old coeff %u, w %d(%d)",
+			jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish),
+			jiffies_to_msecs(bfqq->wr_cur_max_time),
+			bfqq->wr_coeff,
+			bfqq->entity.weight, bfqq->entity.orig_weight);
+
 	bfqq->queued[rq_is_sync(rq)]++;
 	bfqd->queued++;
 
 	elv_rb_add(&bfqq->sort_list, rq);
 
 	/*
-	 * Check if this request is a better next-serve candidate.
+	 * Check if this request is a better next-to-serve candidate.
 	 */
 	prev = bfqq->next_rq;
 	next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position);
@@ -886,160 +1401,10 @@ static void bfq_add_request(struct request *rq)
 	if (prev != bfqq->next_rq)
 		bfq_pos_tree_add_move(bfqd, bfqq);
 
-	if (!bfq_bfqq_busy(bfqq)) {
-		bool soft_rt, coop_or_in_burst,
-		     idle_for_long_time = time_is_before_jiffies(
-						bfqq->budget_timeout +
-						bfqd->bfq_wr_min_idle_time);
-
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
-		bfqg_stats_update_io_add(bfqq_group(RQ_BFQQ(rq)), bfqq,
-					 rq->cmd_flags);
-#endif
-		if (bfq_bfqq_sync(bfqq)) {
-			bool already_in_burst =
-			   !hlist_unhashed(&bfqq->burst_list_node) ||
-			   bfq_bfqq_in_large_burst(bfqq);
-			bfq_handle_burst(bfqd, bfqq, idle_for_long_time);
-			/*
-			 * If bfqq was not already in the current burst,
-			 * then, at this point, bfqq either has been
-			 * added to the current burst or has caused the
-			 * current burst to terminate. In particular, in
-			 * the second case, bfqq has become the first
-			 * queue in a possible new burst.
-			 * In both cases last_ins_in_burst needs to be
-			 * moved forward.
-			 */
-			if (!already_in_burst)
-				bfqd->last_ins_in_burst = jiffies;
-		}
-
-		coop_or_in_burst = bfq_bfqq_in_large_burst(bfqq) ||
-			bfq_bfqq_cooperations(bfqq) >= bfqd->bfq_coop_thresh;
-		soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 &&
-			!coop_or_in_burst &&
-			time_is_before_jiffies(bfqq->soft_rt_next_start);
-		interactive = !coop_or_in_burst && idle_for_long_time;
-		entity->budget = max_t(unsigned long, bfqq->max_budget,
-				       bfq_serv_to_charge(next_rq, bfqq));
-
-		if (!bfq_bfqq_IO_bound(bfqq)) {
-			if (time_before(jiffies,
-					RQ_BIC(rq)->ttime.last_end_request +
-					bfqd->bfq_slice_idle)) {
-				bfqq->requests_within_timer++;
-				if (bfqq->requests_within_timer >=
-				    bfqd->bfq_requests_within_timer)
-					bfq_mark_bfqq_IO_bound(bfqq);
-			} else
-				bfqq->requests_within_timer = 0;
-		}
-
-		if (!bfqd->low_latency)
-			goto add_bfqq_busy;
-
-		if (bfq_bfqq_just_split(bfqq))
-			goto set_prio_changed;
-
-		/*
-		 * If the queue:
-		 * - is not being boosted,
-		 * - has been idle for enough time,
-		 * - is not a sync queue or is linked to a bfq_io_cq (it is
-		 *   shared "for its nature" or it is not shared and its
-		 *   requests have not been redirected to a shared queue)
-		 * start a weight-raising period.
-		 */
-		if (old_wr_coeff == 1 && (interactive || soft_rt) &&
-		    (!bfq_bfqq_sync(bfqq) || bfqq->bic)) {
-			bfqq->wr_coeff = bfqd->bfq_wr_coeff;
-			if (interactive)
-				bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
-			else
-				bfqq->wr_cur_max_time =
-					bfqd->bfq_wr_rt_max_time;
-			bfq_log_bfqq(bfqd, bfqq,
-				     "wrais starting at %lu, rais_max_time %u",
-				     jiffies,
-				     jiffies_to_msecs(bfqq->wr_cur_max_time));
-		} else if (old_wr_coeff > 1) {
-			if (interactive)
-				bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
-			else if (coop_or_in_burst ||
-				 (bfqq->wr_cur_max_time ==
-				  bfqd->bfq_wr_rt_max_time &&
-				  !soft_rt)) {
-				bfqq->wr_coeff = 1;
-				bfq_log_bfqq(bfqd, bfqq,
-					"wrais ending at %lu, rais_max_time %u",
-					jiffies,
-					jiffies_to_msecs(bfqq->
-						wr_cur_max_time));
-			} else if (time_before(
-					bfqq->last_wr_start_finish +
-					bfqq->wr_cur_max_time,
-					jiffies +
-					bfqd->bfq_wr_rt_max_time) &&
-				   soft_rt) {
-				/*
-				 *
-				 * The remaining weight-raising time is lower
-				 * than bfqd->bfq_wr_rt_max_time, which means
-				 * that the application is enjoying weight
-				 * raising either because deemed soft-rt in
-				 * the near past, or because deemed interactive
-				 * a long ago.
-				 * In both cases, resetting now the current
-				 * remaining weight-raising time for the
-				 * application to the weight-raising duration
-				 * for soft rt applications would not cause any
-				 * latency increase for the application (as the
-				 * new duration would be higher than the
-				 * remaining time).
-				 *
-				 * In addition, the application is now meeting
-				 * the requirements for being deemed soft rt.
-				 * In the end we can correctly and safely
-				 * (re)charge the weight-raising duration for
-				 * the application with the weight-raising
-				 * duration for soft rt applications.
-				 *
-				 * In particular, doing this recharge now, i.e.,
-				 * before the weight-raising period for the
-				 * application finishes, reduces the probability
-				 * of the following negative scenario:
-				 * 1) the weight of a soft rt application is
-				 *    raised at startup (as for any newly
-				 *    created application),
-				 * 2) since the application is not interactive,
-				 *    at a certain time weight-raising is
-				 *    stopped for the application,
-				 * 3) at that time the application happens to
-				 *    still have pending requests, and hence
-				 *    is destined to not have a chance to be
-				 *    deemed soft rt before these requests are
-				 *    completed (see the comments to the
-				 *    function bfq_bfqq_softrt_next_start()
-				 *    for details on soft rt detection),
-				 * 4) these pending requests experience a high
-				 *    latency because the application is not
-				 *    weight-raised while they are pending.
-				 */
-				bfqq->last_wr_start_finish = jiffies;
-				bfqq->wr_cur_max_time =
-					bfqd->bfq_wr_rt_max_time;
-			}
-		}
-set_prio_changed:
-		if (old_wr_coeff != bfqq->wr_coeff)
-			entity->prio_changed = 1;
-add_bfqq_busy:
-		bfqq->last_idle_bklogged = jiffies;
-		bfqq->service_from_backlogged = 0;
-		bfq_clear_bfqq_softrt_update(bfqq);
-		bfq_add_bfqq_busy(bfqd, bfqq);
-	} else {
+	if (!bfq_bfqq_busy(bfqq)) /* switching to busy ... */
+		bfq_bfqq_handle_idle_busy_switch(bfqd, bfqq, old_wr_coeff,
+						 rq, &interactive);
+	else {
 		if (bfqd->low_latency && old_wr_coeff == 1 && !rq_is_sync(rq) &&
 		    time_is_before_jiffies(
 				bfqq->last_wr_start_finish +
@@ -1048,16 +1413,43 @@ static void bfq_add_request(struct request *rq)
 			bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
 
 			bfqd->wr_busy_queues++;
-			entity->prio_changed = 1;
+			bfqq->entity.prio_changed = 1;
 			bfq_log_bfqq(bfqd, bfqq,
-			    "non-idle wrais starting at %lu, rais_max_time %u",
-			    jiffies,
-			    jiffies_to_msecs(bfqq->wr_cur_max_time));
+				     "non-idle wrais starting, "
+				     "wr_max_time %u wr_busy %d",
+				     jiffies_to_msecs(bfqq->wr_cur_max_time),
+				     bfqd->wr_busy_queues);
 		}
 		if (prev != bfqq->next_rq)
 			bfq_updated_next_req(bfqd, bfqq);
 	}
 
+	/*
+	 * Assign jiffies to last_wr_start_finish in the following
+	 * cases:
+	 *
+	 * . if bfqq is not going to be weight-raised, because, for
+	 *   non weight-raised queues, last_wr_start_finish stores the
+	 *   arrival time of the last request; as of now, this piece
+	 *   of information is used only for deciding whether to
+	 *   weight-raise async queues
+	 *
+	 * . if bfqq is not weight-raised, because, if bfqq is now
+	 *   switching to weight-raised, then last_wr_start_finish
+	 *   stores the time when weight-raising starts
+	 *
+	 * . if bfqq is interactive, because, regardless of whether
+	 *   bfqq is currently weight-raised, the weight-raising
+	 *   period must start or restart (this case is considered
+	 *   separately because it is not detected by the above
+	 *   conditions, if bfqq is already weight-raised)
+	 *
+	 * last_wr_start_finish has to be updated also if bfqq is soft
+	 * real-time, because the weight-raising period is constantly
+	 * restarted on idle-to-busy transitions for these queues, but
+	 * this is already done in bfq_bfqq_handle_idle_busy_switch if
+	 * needed.
+	 */
 	if (bfqd->low_latency &&
 		(old_wr_coeff == 1 || bfqq->wr_coeff == 1 || interactive))
 		bfqq->last_wr_start_finish = jiffies;
@@ -1081,14 +1473,24 @@ static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd,
 	return NULL;
 }
 
+static sector_t get_sdist(sector_t last_pos, struct request *rq)
+{
+	sector_t rq_pos = blk_rq_pos(rq);
+
+	/* a zero last_pos always yields a zero distance */
+	if (!last_pos)
+		return 0;
+
+	/* otherwise, absolute distance between the two sector positions */
+	if (last_pos < rq_pos)
+		return rq_pos - last_pos;
+	return last_pos - rq_pos;
+}
+
 static void bfq_activate_request(struct request_queue *q, struct request *rq)
 {
 	struct bfq_data *bfqd = q->elevator->elevator_data;
-
 	bfqd->rq_in_driver++;
-	bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
-	bfq_log(bfqd, "activate_request: new bfqd->last_position %llu",
-		(unsigned long long) bfqd->last_position);
 }
 
 static void bfq_deactivate_request(struct request_queue *q, struct request *rq)
@@ -1105,6 +1507,9 @@ static void bfq_remove_request(struct request *rq)
 	struct bfq_data *bfqd = bfqq->bfqd;
 	const int sync = rq_is_sync(rq);
 
+	BUG_ON(bfqq->entity.service > bfqq->entity.budget &&
+	       bfqq == bfqd->in_service_queue);
+
 	if (bfqq->next_rq == rq) {
 		bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq);
 		bfq_updated_next_req(bfqd, bfqq);
@@ -1118,8 +1523,25 @@ static void bfq_remove_request(struct request *rq)
 	elv_rb_del(&bfqq->sort_list, rq);
 
 	if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
-		if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue)
-			bfq_del_bfqq_busy(bfqd, bfqq, 1);
+		BUG_ON(bfqq->entity.budget < 0);
+
+		if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue) {
+			bfq_del_bfqq_busy(bfqd, bfqq, false);
+
+			/* bfqq emptied. In normal operation, when
+			 * bfqq is empty, bfqq->entity.service and
+			 * bfqq->entity.budget must contain,
+			 * respectively, the service received and the
+			 * budget used last time bfqq emptied. These
+			 * facts do not hold in this case, as at least
+			 * this last removal occurred while bfqq is
+			 * not in service. To avoid inconsistencies,
+			 * reset both bfqq->entity.service and
+			 * bfqq->entity.budget.
+			 */
+			bfqq->entity.budget = bfqq->entity.service = 0;
+		}
+
 		/*
 		 * Remove queue from request-position tree as it is empty.
 		 */
@@ -1133,9 +1555,8 @@ static void bfq_remove_request(struct request *rq)
 		BUG_ON(bfqq->meta_pending == 0);
 		bfqq->meta_pending--;
 	}
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
-	bfqg_stats_update_io_remove(bfqq_group(bfqq), rq->cmd_flags);
-#endif
+	bfqg_stats_update_io_remove(bfqq_group(bfqq), req_op(rq),
+				    rq->cmd_flags);
 }
 
 static int bfq_merge(struct request_queue *q, struct request **req,
@@ -1145,7 +1566,7 @@ static int bfq_merge(struct request_queue *q, struct request **req,
 	struct request *__rq;
 
 	__rq = bfq_find_rq_fmerge(bfqd, bio);
-	if (__rq && elv_rq_merge_ok(__rq, bio)) {
+	if (__rq && elv_bio_merge_ok(__rq, bio)) {
 		*req = __rq;
 		return ELEVATOR_FRONT_MERGE;
 	}
@@ -1190,7 +1611,8 @@ static void bfq_merged_request(struct request_queue *q, struct request *req,
 static void bfq_bio_merged(struct request_queue *q, struct request *req,
 			   struct bio *bio)
 {
-	bfqg_stats_update_io_merged(bfqq_group(RQ_BFQQ(req)), bio->bi_rw);
+	bfqg_stats_update_io_merged(bfqq_group(RQ_BFQQ(req)), bio_op(bio),
+				    bio->bi_opf);
 }
 #endif
 
@@ -1210,7 +1632,7 @@ static void bfq_merged_requests(struct request_queue *q, struct request *rq,
 	 */
 	if (bfqq == next_bfqq &&
 	    !list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&
-	    time_before(next->fifo_time, rq->fifo_time)) {
+	    next->fifo_time < rq->fifo_time) {
 		list_del_init(&rq->queuelist);
 		list_replace_init(&next->queuelist, &rq->queuelist);
 		rq->fifo_time = next->fifo_time;
@@ -1220,21 +1642,31 @@ static void bfq_merged_requests(struct request_queue *q, struct request *rq,
 		bfqq->next_rq = rq;
 
 	bfq_remove_request(next);
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
-	bfqg_stats_update_io_merged(bfqq_group(bfqq), next->cmd_flags);
-#endif
+	bfqg_stats_update_io_merged(bfqq_group(bfqq), req_op(next),
+				    next->cmd_flags);
 }
 
 /* Must be called with bfqq != NULL */
 static void bfq_bfqq_end_wr(struct bfq_queue *bfqq)
 {
 	BUG_ON(!bfqq);
+
 	if (bfq_bfqq_busy(bfqq))
 		bfqq->bfqd->wr_busy_queues--;
 	bfqq->wr_coeff = 1;
 	bfqq->wr_cur_max_time = 0;
-	/* Trigger a weight change on the next activation of the queue */
+	bfqq->last_wr_start_finish = jiffies;
+	/*
+	 * Trigger a weight change on the next invocation of
+	 * __bfq_entity_update_weight_prio.
+	 */
 	bfqq->entity.prio_changed = 1;
+	bfq_log_bfqq(bfqq->bfqd, bfqq,
+		     "end_wr: wrais ending at %lu, rais_max_time %u",
+		     bfqq->last_wr_start_finish,
+		     jiffies_to_msecs(bfqq->wr_cur_max_time));
+	bfq_log_bfqq(bfqq->bfqd, bfqq, "end_wr: wr_busy %d",
+		     bfqq->bfqd->wr_busy_queues);
 }
 
 static void bfq_end_wr_async_queues(struct bfq_data *bfqd,
@@ -1277,7 +1709,7 @@ static int bfq_rq_close_to_sector(void *io_struct, bool request,
 				  sector_t sector)
 {
 	return abs(bfq_io_struct_pos(io_struct, request) - sector) <=
-	       BFQQ_SEEK_THR;
+	       BFQQ_CLOSE_THR;
 }
 
 static struct bfq_queue *bfqq_find_close(struct bfq_data *bfqd,
@@ -1399,7 +1831,7 @@ bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
 	 * throughput.
 	 */
 	bfqq->new_bfqq = new_bfqq;
-	atomic_add(process_refs, &new_bfqq->ref);
+	new_bfqq->ref += process_refs;
 	return new_bfqq;
 }
 
@@ -1430,9 +1862,23 @@ static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq,
 }
 
 /*
- * Attempt to schedule a merge of bfqq with the currently in-service queue
- * or with a close queue among the scheduled queues.
- * Return NULL if no merge was scheduled, a pointer to the shared bfq_queue
+ * If this function returns true, then bfqq cannot be merged. The idea
+ * is that true cooperation happens very early after processes start
+ * to do I/O. Usually, late cooperations are just accidental false
+ * positives. In case bfqq is weight-raised, such false positives
+ * would evidently degrade latency guarantees for bfqq.
+ */
+static bool wr_from_too_long(struct bfq_queue *bfqq)
+{
+	return bfqq->wr_coeff > 1 && /* wr, last (re)start > 100 ms ago */
+		time_is_before_jiffies(bfqq->last_wr_start_finish +
+				       msecs_to_jiffies(100));
+}
+
+/*
+ * Attempt to schedule a merge of bfqq with the currently in-service
+ * queue or with a close queue among the scheduled queues.  Return
+ * NULL if no merge was scheduled, a pointer to the shared bfq_queue
  * structure otherwise.
  *
  * The OOM queue is not allowed to participate to cooperation: in fact, since
@@ -1441,6 +1887,18 @@ static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq,
  * handle merging with the OOM queue would be quite complex and expensive
  * to maintain. Besides, in such a critical condition as an out of memory,
  * the benefits of queue merging may be little relevant, or even negligible.
+ *
+ * Weight-raised queues can be merged only if their weight-raising
+ * period has just started. In fact cooperating processes are usually
+ * started together. Thus, with this filter we avoid false positives
+ * that would jeopardize low-latency guarantees.
+ *
+ * WARNING: queue merging may impair fairness among non-weight raised
+ * queues, for at least two reasons: 1) the original weight of a
+ * merged queue may change during the merged state, 2) even being the
+ * weight the same, a merged queue may be bloated with many more
+ * requests than the ones produced by its originally-associated
+ * process.
  */
 static struct bfq_queue *
 bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq,
@@ -1450,16 +1908,32 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq,
 
 	if (bfqq->new_bfqq)
 		return bfqq->new_bfqq;
-	if (!io_struct || unlikely(bfqq == &bfqd->oom_bfqq))
+
+	if (io_struct && wr_from_too_long(bfqq) &&
+	    likely(bfqq != &bfqd->oom_bfqq))
+		bfq_log_bfqq(bfqd, bfqq,
+			     "would have looked for coop, but bfq%d wr",
+			bfqq->pid);
+
+	if (!io_struct ||
+	    wr_from_too_long(bfqq) ||
+	    unlikely(bfqq == &bfqd->oom_bfqq))
 		return NULL;
-	/* If device has only one backlogged bfq_queue, don't search. */
+
+	/* If there is only one backlogged queue, don't search. */
 	if (bfqd->busy_queues == 1)
 		return NULL;
 
 	in_service_bfqq = bfqd->in_service_queue;
 
+	if (in_service_bfqq && in_service_bfqq != bfqq &&
+	    bfqd->in_service_bic && wr_from_too_long(in_service_bfqq) &&
+	    likely(in_service_bfqq != &bfqd->oom_bfqq)) /* log-only guard */
+		bfq_log_bfqq(bfqd, bfqq,
+		"would have tried merge with in-service-queue, but wr");
+
 	if (!in_service_bfqq || in_service_bfqq == bfqq ||
-	    !bfqd->in_service_bic ||
+	    !bfqd->in_service_bic || wr_from_too_long(in_service_bfqq) ||
 	    unlikely(in_service_bfqq == &bfqd->oom_bfqq))
 		goto check_scheduled;
 
@@ -1481,7 +1955,15 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq,
 
 	BUG_ON(new_bfqq && bfqq->entity.parent != new_bfqq->entity.parent);
 
-	if (new_bfqq && likely(new_bfqq != &bfqd->oom_bfqq) &&
+	if (new_bfqq && wr_from_too_long(new_bfqq) &&
+	    likely(new_bfqq != &bfqd->oom_bfqq) &&
+	    bfq_may_be_close_cooperator(bfqq, new_bfqq))
+		bfq_log_bfqq(bfqd, bfqq,
+			     "would have merged with bfq%d, but wr",
+			     new_bfqq->pid);
+
+	if (new_bfqq && !wr_from_too_long(new_bfqq) &&
+	    likely(new_bfqq != &bfqd->oom_bfqq) &&
 	    bfq_may_be_close_cooperator(bfqq, new_bfqq))
 		return bfq_setup_merge(bfqq, new_bfqq);
 
@@ -1490,53 +1972,25 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq,
 
 static void bfq_bfqq_save_state(struct bfq_queue *bfqq)
 {
+	struct bfq_io_cq *bic = bfqq->bic;
+
 	/*
 	 * If !bfqq->bic, the queue is already shared or its requests
 	 * have already been redirected to a shared queue; both idle window
 	 * and weight raising state have already been saved. Do nothing.
 	 */
-	if (!bfqq->bic)
+	if (!bic)
 		return;
-	if (bfqq->bic->wr_time_left)
-		/*
-		 * This is the queue of a just-started process, and would
-		 * deserve weight raising: we set wr_time_left to the full
-		 * weight-raising duration to trigger weight-raising when
-		 * and if the queue is split and the first request of the
-		 * queue is enqueued.
-		 */
-		bfqq->bic->wr_time_left = bfq_wr_duration(bfqq->bfqd);
-	else if (bfqq->wr_coeff > 1) {
-		unsigned long wr_duration =
-			jiffies - bfqq->last_wr_start_finish;
-		/*
-		 * It may happen that a queue's weight raising period lasts
-		 * longer than its wr_cur_max_time, as weight raising is
-		 * handled only when a request is enqueued or dispatched (it
-		 * does not use any timer). If the weight raising period is
-		 * about to end, don't save it.
-		 */
-		if (bfqq->wr_cur_max_time <= wr_duration)
-			bfqq->bic->wr_time_left = 0;
-		else
-			bfqq->bic->wr_time_left =
-				bfqq->wr_cur_max_time - wr_duration;
-		/*
-		 * The bfq_queue is becoming shared or the requests of the
-		 * process owning the queue are being redirected to a shared
-		 * queue. Stop the weight raising period of the queue, as in
-		 * both cases it should not be owned by an interactive or
-		 * soft real-time application.
-		 */
-		bfq_bfqq_end_wr(bfqq);
-	} else
-		bfqq->bic->wr_time_left = 0;
-	bfqq->bic->saved_idle_window = bfq_bfqq_idle_window(bfqq);
-	bfqq->bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq);
-	bfqq->bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq);
-	bfqq->bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node);
-	bfqq->bic->cooperations++;
-	bfqq->bic->failed_cooperations = 0;
+
+	bic->saved_idle_window = bfq_bfqq_idle_window(bfqq);
+	bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq);
+	bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq);
+	bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node);
+	bic->saved_wr_coeff = bfqq->wr_coeff;
+	bic->saved_wr_start_at_switch_to_srt = bfqq->wr_start_at_switch_to_srt;
+	bic->saved_last_wr_start_finish = bfqq->last_wr_start_finish;
+	bic->saved_wr_cur_max_time = bfqq->wr_cur_max_time;
+	BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish));
 }
 
 static void bfq_get_bic_reference(struct bfq_queue *bfqq)
@@ -1561,6 +2015,40 @@ bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
 	if (bfq_bfqq_IO_bound(bfqq))
 		bfq_mark_bfqq_IO_bound(new_bfqq);
 	bfq_clear_bfqq_IO_bound(bfqq);
+
+	/*
+	 * If bfqq is weight-raised, then let new_bfqq inherit
+	 * weight-raising. To reduce false positives, neglect the case
+	 * where bfqq has just been created, but has not yet made it
+	 * to be weight-raised (which may happen because EQM may merge
+	 * bfqq even before bfq_add_request is executed for the first
+	 * time for bfqq). Handling this case would however be very
+	 * easy, thanks to the flag just_created.
+	 */
+	if (new_bfqq->wr_coeff == 1 && bfqq->wr_coeff > 1) {
+		new_bfqq->wr_coeff = bfqq->wr_coeff;
+		new_bfqq->wr_cur_max_time = bfqq->wr_cur_max_time;
+		new_bfqq->last_wr_start_finish = bfqq->last_wr_start_finish;
+		new_bfqq->wr_start_at_switch_to_srt = bfqq->wr_start_at_switch_to_srt;
+		if (bfq_bfqq_busy(new_bfqq))
+			bfqd->wr_busy_queues++;
+		new_bfqq->entity.prio_changed = 1;
+		bfq_log_bfqq(bfqd, new_bfqq,
+			     "wr start after merge with %d, rais_max_time %u",
+			     bfqq->pid,
+			     jiffies_to_msecs(bfqq->wr_cur_max_time));
+	}
+
+	if (bfqq->wr_coeff > 1) { /* bfqq has given its wr to new_bfqq */
+		bfqq->wr_coeff = 1;
+		bfqq->entity.prio_changed = 1;
+		if (bfq_bfqq_busy(bfqq))
+			bfqd->wr_busy_queues--;
+	}
+
+	bfq_log_bfqq(bfqd, new_bfqq, "merge_bfqqs: wr_busy %d",
+		     bfqd->wr_busy_queues);
+
 	/*
 	 * Grab a reference to the bic, to prevent it from being destroyed
 	 * before being possibly touched by a bfq_split_bfqq().
@@ -1587,20 +2075,8 @@ bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
 	bfq_put_queue(bfqq);
 }
 
-static void bfq_bfqq_increase_failed_cooperations(struct bfq_queue *bfqq)
-{
-	struct bfq_io_cq *bic = bfqq->bic;
-	struct bfq_data *bfqd = bfqq->bfqd;
-
-	if (bic && bfq_bfqq_cooperations(bfqq) >= bfqd->bfq_coop_thresh) {
-		bic->failed_cooperations++;
-		if (bic->failed_cooperations >= bfqd->bfq_failed_cooperations)
-			bic->cooperations = 0;
-	}
-}
-
-static int bfq_allow_merge(struct request_queue *q, struct request *rq,
-			   struct bio *bio)
+static int bfq_allow_bio_merge(struct request_queue *q, struct request *rq,
+			       struct bio *bio)
 {
 	struct bfq_data *bfqd = q->elevator->elevator_data;
 	struct bfq_io_cq *bic;
@@ -1610,7 +2086,7 @@ static int bfq_allow_merge(struct request_queue *q, struct request *rq,
 	 * Disallow merge of a sync bio into an async request.
 	 */
 	if (bfq_bio_sync(bio) && !rq_is_sync(rq))
-		return 0;
+		return false;
 
 	/*
 	 * Lookup the bfqq that this bio will be queued with. Allow
@@ -1619,7 +2095,7 @@ static int bfq_allow_merge(struct request_queue *q, struct request *rq,
 	 */
 	bic = bfq_bic_lookup(bfqd, current->io_context);
 	if (!bic)
-		return 0;
+		return false;
 
 	bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
 	/*
@@ -1636,30 +2112,111 @@ static int bfq_allow_merge(struct request_queue *q, struct request *rq,
 			 * to decide whether bio and rq can be merged.
 			 */
 			bfqq = new_bfqq;
-		} else
-			bfq_bfqq_increase_failed_cooperations(bfqq);
+		}
 	}
 
 	return bfqq == RQ_BFQQ(rq);
 }
 
+static int bfq_allow_rq_merge(struct request_queue *q, struct request *rq,
+			      struct request *next)
+{
+	return RQ_BFQQ(rq) == RQ_BFQQ(next);
+}
+
+/*
+ * Set the maximum time for the in-service queue to consume its
+ * budget. This prevents seeky processes from lowering the throughput.
+ * In practice, a time-slice service scheme is used with seeky
+ * processes.
+ */
+static void bfq_set_budget_timeout(struct bfq_data *bfqd,
+				   struct bfq_queue *bfqq)
+{
+	unsigned int timeout_coeff;
+
+	if (bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time)
+		timeout_coeff = 1;
+	else
+		timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight;
+
+	bfqd->last_budget_start = ktime_get();
+
+	bfqq->budget_timeout = jiffies +
+		bfqd->bfq_timeout * timeout_coeff;
+
+	bfq_log_bfqq(bfqd, bfqq, "set budget_timeout %u",
+		jiffies_to_msecs(bfqd->bfq_timeout * timeout_coeff));
+}
+
 static void __bfq_set_in_service_queue(struct bfq_data *bfqd,
 				       struct bfq_queue *bfqq)
 {
 	if (bfqq) {
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
 		bfqg_stats_update_avg_queue_size(bfqq_group(bfqq));
-#endif
 		bfq_mark_bfqq_must_alloc(bfqq);
-		bfq_mark_bfqq_budget_new(bfqq);
 		bfq_clear_bfqq_fifo_expire(bfqq);
 
 		bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8;
 
+		BUG_ON(bfqq == bfqd->in_service_queue);
+		BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list));
+
+		if (time_is_before_jiffies(bfqq->last_wr_start_finish) &&
+		    bfqq->wr_coeff > 1 &&
+		    bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time &&
+		    time_is_before_jiffies(bfqq->budget_timeout)) {
+			/*
+			 * For soft real-time queues, move the start
+			 * of the weight-raising period forward by the
+			 * time the queue has not received any
+			 * service. Otherwise, a relatively long
+			 * service delay is likely to cause the
+			 * weight-raising period of the queue to end,
+			 * because of the short duration of the
+			 * weight-raising period of a soft real-time
+			 * queue.  It is worth noting that this move
+			 * is not so dangerous for the other queues,
+			 * because soft real-time queues are not
+			 * greedy.
+			 *
+			 * To not add a further variable, we use the
+			 * overloaded field budget_timeout to
+			 * determine for how long the queue has not
+			 * received service, i.e., how much time has
+			 * elapsed since the queue expired. However,
+			 * this is a little imprecise, because
+			 * budget_timeout is set to jiffies if bfqq
+			 * not only expires, but also remains with no
+			 * request.
+			 */
+			if (time_after(bfqq->budget_timeout,
+				       bfqq->last_wr_start_finish))
+				bfqq->last_wr_start_finish +=
+					jiffies - bfqq->budget_timeout;
+			else
+				bfqq->last_wr_start_finish = jiffies;
+
+			if (time_is_after_jiffies(bfqq->last_wr_start_finish)) {
+			       pr_crit(
+			       "BFQ WARNING:last %lu budget %lu jiffies %lu",
+			       bfqq->last_wr_start_finish,
+			       bfqq->budget_timeout,
+			       jiffies);
+			       pr_crit("diff %lu", jiffies -
+				       max_t(unsigned long,
+					     bfqq->last_wr_start_finish,
+					     bfqq->budget_timeout));
+			       bfqq->last_wr_start_finish = jiffies;
+			}
+		}
+
+		bfq_set_budget_timeout(bfqd, bfqq);
 		bfq_log_bfqq(bfqd, bfqq,
 			     "set_in_service_queue, cur-budget = %d",
 			     bfqq->entity.budget);
-	}
+	} else
+		bfq_log(bfqd, "set_in_service_queue: NULL");
 
 	bfqd->in_service_queue = bfqq;
 }
@@ -1675,36 +2232,11 @@ static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd)
 	return bfqq;
 }
 
-/*
- * If enough samples have been computed, return the current max budget
- * stored in bfqd, which is dynamically updated according to the
- * estimated disk peak rate; otherwise return the default max budget
- */
-static int bfq_max_budget(struct bfq_data *bfqd)
-{
-	if (bfqd->budgets_assigned < bfq_stats_min_budgets)
-		return bfq_default_max_budget;
-	else
-		return bfqd->bfq_max_budget;
-}
-
-/*
- * Return min budget, which is a fraction of the current or default
- * max budget (trying with 1/32)
- */
-static int bfq_min_budget(struct bfq_data *bfqd)
-{
-	if (bfqd->budgets_assigned < bfq_stats_min_budgets)
-		return bfq_default_max_budget / 32;
-	else
-		return bfqd->bfq_max_budget / 32;
-}
-
 static void bfq_arm_slice_timer(struct bfq_data *bfqd)
 {
 	struct bfq_queue *bfqq = bfqd->in_service_queue;
 	struct bfq_io_cq *bic;
-	unsigned long sl;
+	u32 sl;
 
 	BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list));
 
@@ -1728,59 +2260,343 @@ static void bfq_arm_slice_timer(struct bfq_data *bfqd)
 	sl = bfqd->bfq_slice_idle;
 	/*
 	 * Unless the queue is being weight-raised or the scenario is
-	 * asymmetric, grant only minimum idle time if the queue either
-	 * has been seeky for long enough or has already proved to be
-	 * constantly seeky.
+	 * asymmetric, grant only minimum idle time if the queue
+	 * is seeky. A long idling is preserved for a weight-raised
+	 * queue, or, more generally, in an asymmetric scenario,
+	 * because a long idling is needed for guaranteeing to a queue
+	 * its reserved share of the throughput (in particular, it is
+	 * needed if the queue has a higher weight than some other
+	 * queue).
 	 */
-	if (bfq_sample_valid(bfqq->seek_samples) &&
-	    ((BFQQ_SEEKY(bfqq) && bfqq->entity.service >
-				  bfq_max_budget(bfqq->bfqd) / 8) ||
-	      bfq_bfqq_constantly_seeky(bfqq)) && bfqq->wr_coeff == 1 &&
+	if (BFQQ_SEEKY(bfqq) && bfqq->wr_coeff == 1 &&
 	    bfq_symmetric_scenario(bfqd))
-		sl = min(sl, msecs_to_jiffies(BFQ_MIN_TT));
-	else if (bfqq->wr_coeff > 1)
-		sl = sl * 3;
+		sl = min_t(u32, sl, BFQ_MIN_TT);
+
 	bfqd->last_idling_start = ktime_get();
-	mod_timer(&bfqd->idle_slice_timer, jiffies + sl);
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
+	hrtimer_start(&bfqd->idle_slice_timer, ns_to_ktime(sl),
+		      HRTIMER_MODE_REL);
 	bfqg_stats_set_start_idle_time(bfqq_group(bfqq));
-#endif
-	bfq_log(bfqd, "arm idle: %u/%u ms",
-		jiffies_to_msecs(sl), jiffies_to_msecs(bfqd->bfq_slice_idle));
+	bfq_log(bfqd, "arm idle: %ld/%ld ms",
+		sl / NSEC_PER_MSEC, bfqd->bfq_slice_idle / NSEC_PER_MSEC);
 }
 
 /*
- * Set the maximum time for the in-service queue to consume its
- * budget. This prevents seeky processes from lowering the disk
- * throughput (always guaranteed with a time slice scheme as in CFQ).
+ * In autotuning mode, max_budget is dynamically recomputed as the
+ * amount of sectors transferred in timeout at the estimated peak
+ * rate. This enables BFQ to utilize a full timeslice with a full
+ * budget, even if the in-service queue is served at peak rate. And
+ * this maximises throughput with sequential workloads.
  */
-static void bfq_set_budget_timeout(struct bfq_data *bfqd)
+static unsigned long bfq_calc_max_budget(struct bfq_data *bfqd)
 {
-	struct bfq_queue *bfqq = bfqd->in_service_queue;
-	unsigned int timeout_coeff;
+	return (u64)bfqd->peak_rate * USEC_PER_MSEC *
+		jiffies_to_msecs(bfqd->bfq_timeout)>>BFQ_RATE_SHIFT;
+}
 
-	if (bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time)
-		timeout_coeff = 1;
+/*
+ * Update parameters related to throughput and responsiveness, as a
+ * function of the estimated peak rate. See comments on
+ * bfq_calc_max_budget(), and on T_slow and T_fast arrays.
+ */
+void update_thr_responsiveness_params(struct bfq_data *bfqd)
+{
+	int dev_type = blk_queue_nonrot(bfqd->queue);
+
+	if (bfqd->bfq_user_max_budget == 0) {
+		bfqd->bfq_max_budget =
+			bfq_calc_max_budget(bfqd);
+		BUG_ON(bfqd->bfq_max_budget < 0);
+		bfq_log(bfqd, "new max_budget = %d",
+			bfqd->bfq_max_budget);
+	}
+
+	if (bfqd->device_speed == BFQ_BFQD_FAST &&
+	    bfqd->peak_rate < device_speed_thresh[dev_type]) {
+		bfqd->device_speed = BFQ_BFQD_SLOW;
+		bfqd->RT_prod = R_slow[dev_type] *
+			T_slow[dev_type];
+	} else if (bfqd->device_speed == BFQ_BFQD_SLOW &&
+		   bfqd->peak_rate > device_speed_thresh[dev_type]) {
+		bfqd->device_speed = BFQ_BFQD_FAST;
+		bfqd->RT_prod = R_fast[dev_type] *
+			T_fast[dev_type];
+	}
+
+	bfq_log(bfqd,
+"dev_type %s dev_speed_class = %s (%llu sects/sec), thresh %llu setcs/sec",
+		dev_type == 0 ? "ROT" : "NONROT",
+		bfqd->device_speed == BFQ_BFQD_FAST ? "FAST" : "SLOW",
+		bfqd->device_speed == BFQ_BFQD_FAST ?
+		(USEC_PER_SEC*(u64)R_fast[dev_type])>>BFQ_RATE_SHIFT :
+		(USEC_PER_SEC*(u64)R_slow[dev_type])>>BFQ_RATE_SHIFT,
+		(USEC_PER_SEC*(u64)device_speed_thresh[dev_type])>>
+		BFQ_RATE_SHIFT);
+}
+
+void bfq_reset_rate_computation(struct bfq_data *bfqd, struct request *rq)
+{
+	if (rq != NULL) { /* new rq dispatch now, reset accordingly */
+		bfqd->last_dispatch = bfqd->first_dispatch = ktime_get_ns() ;
+		bfqd->peak_rate_samples = 1;
+		bfqd->sequential_samples = 0;
+		bfqd->tot_sectors_dispatched = bfqd->last_rq_max_size =
+			blk_rq_sectors(rq);
+	} else /* no new rq dispatched, just reset the number of samples */
+		bfqd->peak_rate_samples = 0; /* full re-init on next disp. */
+
+	bfq_log(bfqd,
+		"reset_rate_computation at end, sample %u/%u tot_sects %llu",
+		bfqd->peak_rate_samples, bfqd->sequential_samples,
+		bfqd->tot_sectors_dispatched);
+}
+
+void bfq_update_rate_reset(struct bfq_data *bfqd, struct request *rq)
+{
+	u32 rate, weight, divisor;
+
+	/*
+	 * For the convergence property to hold (see comments on
+	 * bfq_update_peak_rate()) and for the assessment to be
+	 * reliable, a minimum number of samples must be present, and
+	 * a minimum amount of time must have elapsed. If not so, do
+	 * not compute new rate. Just reset parameters, to get ready
+	 * for a new evaluation attempt.
+	 */
+	if (bfqd->peak_rate_samples < BFQ_RATE_MIN_SAMPLES ||
+	    bfqd->delta_from_first < BFQ_RATE_MIN_INTERVAL) {
+		bfq_log(bfqd,
+	"update_rate_reset: only resetting, delta_first %lluus samples %d",
+			bfqd->delta_from_first>>10, bfqd->peak_rate_samples);
+		goto reset_computation;
+	}
+
+	/*
+	 * If a new request completion has occurred after last
+	 * dispatch, then, to approximate the rate at which requests
+	 * have been served by the device, it is more precise to
+	 * extend the observation interval to the last completion.
+	 */
+	bfqd->delta_from_first =
+		max_t(u64, bfqd->delta_from_first,
+		      bfqd->last_completion - bfqd->first_dispatch);
+
+	BUG_ON(bfqd->delta_from_first == 0);
+	/*
+	 * Rate computed in sects/usec, and not sects/nsec, for
+	 * precision issues.
+	 */
+	rate = div64_ul(bfqd->tot_sectors_dispatched<<BFQ_RATE_SHIFT,
+			div_u64(bfqd->delta_from_first, NSEC_PER_USEC));
+
+	bfq_log(bfqd,
+"update_rate_reset: tot_sects %llu delta_first %lluus rate %llu sects/s (%d)",
+		bfqd->tot_sectors_dispatched, bfqd->delta_from_first>>10,
+		((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT),
+		rate > 20<<BFQ_RATE_SHIFT);
+
+	/*
+	 * Peak rate not updated if:
+	 * - the percentage of sequential dispatches is below 3/4 of the
+	 *   total, and rate is below the current estimated peak rate
+	 * - rate is unreasonably high (> 20M sectors/sec)
+	 */
+	if ((bfqd->sequential_samples < (3 * bfqd->peak_rate_samples)>>2 &&
+	     rate <= bfqd->peak_rate) ||
+		rate > 20<<BFQ_RATE_SHIFT) {
+		bfq_log(bfqd,
+		"update_rate_reset: goto reset, samples %u/%u rate/peak %llu/%llu",
+		bfqd->peak_rate_samples, bfqd->sequential_samples,
+		((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT),
+		((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT));
+		goto reset_computation;
+	} else {
+		bfq_log(bfqd,
+		"update_rate_reset: do update, samples %u/%u rate/peak %llu/%llu",
+		bfqd->peak_rate_samples, bfqd->sequential_samples,
+		((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT),
+		((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT));
+	}
+
+	/*
+	 * We have to update the peak rate, at last! To this purpose,
+	 * we use a low-pass filter. We compute the smoothing constant
+	 * of the filter as a function of the 'weight' of the new
+	 * measured rate.
+	 *
+	 * As can be seen in next formulas, we define this weight as a
+	 * quantity proportional to how sequential the workload is,
+	 * and to how long the observation time interval is.
+	 *
+	 * The weight runs from 0 to 8. The maximum value of the
+	 * weight, 8, yields the minimum value for the smoothing
+	 * constant. At this minimum value for the smoothing constant,
+	 * the measured rate contributes for half of the next value of
+	 * the estimated peak rate.
+	 *
+	 * So, the first step is to compute the weight as a function
+	 * of how sequential the workload is. Note that the weight
+	 * cannot reach 9, because bfqd->sequential_samples cannot
+	 * become equal to bfqd->peak_rate_samples, which, in its
+	 * turn, holds true because bfqd->sequential_samples is not
+	 * incremented for the first sample.
+	 */
+	weight = (9 * bfqd->sequential_samples) / bfqd->peak_rate_samples;
+
+	/*
+	 * Second step: further refine the weight as a function of the
+	 * duration of the observation interval.
+	 */
+	weight = min_t(u32, 8,
+		       div_u64(weight * bfqd->delta_from_first,
+			       BFQ_RATE_REF_INTERVAL));
+
+	/*
+	 * Divisor ranging from 10, for minimum weight, to 2, for
+	 * maximum weight.
+	 */
+	divisor = 10 - weight;
+	BUG_ON(divisor == 0);
+
+	/*
+	 * Finally, update peak rate:
+	 *
+	 * peak_rate = peak_rate * (divisor-1) / divisor  +  rate / divisor
+	 */
+	bfqd->peak_rate *= divisor-1;
+	bfqd->peak_rate /= divisor;
+	rate /= divisor; /* smoothing constant alpha = 1/divisor */
+
+	bfq_log(bfqd,
+		"update_rate_reset: divisor %d tmp_peak_rate %llu tmp_rate %u",
+		divisor,
+		((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT),
+		(u32)((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT));
+
+	BUG_ON(bfqd->peak_rate == 0);
+	BUG_ON(bfqd->peak_rate > 20<<BFQ_RATE_SHIFT);
+
+	bfqd->peak_rate += rate;
+	update_thr_responsiveness_params(bfqd);
+	BUG_ON(bfqd->peak_rate > 20<<BFQ_RATE_SHIFT);
+
+reset_computation:
+	bfq_reset_rate_computation(bfqd, rq);
+}
+
+/*
+ * Update the read/write peak rate (the main quantity used for
+ * auto-tuning, see update_thr_responsiveness_params()).
+ *
+ * It is not trivial to estimate the peak rate (correctly): because of
+ * the presence of sw and hw queues between the scheduler and the
+ * device components that finally serve I/O requests, it is hard to
+ * say exactly when a given dispatched request is served inside the
+ * device, and for how long. As a consequence, it is hard to know
+ * precisely at what rate a given set of requests is actually served
+ * by the device.
+ *
+ * On the opposite end, the dispatch time of any request is trivially
+ * available, and, from this piece of information, the "dispatch rate"
+ * of requests can be immediately computed. So, the idea in the next
+ * function is to use what is known, namely request dispatch times
+ * (plus, when useful, request completion times), to estimate what is
+ * unknown, namely in-device request service rate.
+ *
+ * The main issue is that, because of the above facts, the rate at
+ * which a certain set of requests is dispatched over a certain time
+ * interval can vary greatly with respect to the rate at which the
+ * same requests are then served. But, since the size of any
+ * intermediate queue is limited, and the service scheme is lossless
+ * (no request is silently dropped), the following obvious convergence
+ * property holds: the number of requests dispatched MUST become
+ * closer and closer to the number of requests completed as the
+ * observation interval grows. This is the key property used in
+ * the next function to estimate the peak service rate as a function
+ * of the observed dispatch rate. The function assumes that it is
+ * invoked on every request dispatch.
+ */
+void bfq_update_peak_rate(struct bfq_data *bfqd, struct request *rq)
+{
+	u64 now_ns = ktime_get_ns();
+
+	if (bfqd->peak_rate_samples == 0) { /* first dispatch */
+		bfq_log(bfqd,
+		"update_peak_rate: goto reset, samples %d",
+				bfqd->peak_rate_samples) ;
+		bfq_reset_rate_computation(bfqd, rq);
+		goto update_last_values; /* will add one sample */
+	}
+
+	/*
+	 * Device idle for very long: the observation interval lasting
+	 * up to this dispatch cannot be a valid observation interval
+	 * for computing a new peak rate (similarly to the late-
+	 * completion event in bfq_completed_request()). Go to
+	 * update_rate_and_reset to have the following three steps
+	 * taken:
+	 * - close the observation interval at the last (previous)
+	 *   request dispatch or completion
+	 * - compute rate, if possible, for that observation interval
+	 * - start a new observation interval with this dispatch
+	 */
+	if (now_ns - bfqd->last_dispatch > 100*NSEC_PER_MSEC &&
+	    bfqd->rq_in_driver == 0) {
+		bfq_log(bfqd,
+"update_peak_rate: jumping to updating&resetting delta_last %lluus samples %d",
+			(now_ns - bfqd->last_dispatch)>>10,
+			bfqd->peak_rate_samples) ;
+		goto update_rate_and_reset;
+	}
+
+	/* Update sampling information */
+	bfqd->peak_rate_samples++;
+
+	if ((bfqd->rq_in_driver > 0 ||
+		now_ns - bfqd->last_completion < BFQ_MIN_TT)
+	     && get_sdist(bfqd->last_position, rq) < BFQQ_SEEK_THR)
+		bfqd->sequential_samples++;
+
+	bfqd->tot_sectors_dispatched += blk_rq_sectors(rq);
+
+	/* Reset max observed rq size every 32 dispatches */
+	if (likely(bfqd->peak_rate_samples % 32))
+		bfqd->last_rq_max_size =
+			max_t(u32, blk_rq_sectors(rq), bfqd->last_rq_max_size);
 	else
-		timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight;
+		bfqd->last_rq_max_size = blk_rq_sectors(rq);
 
-	bfqd->last_budget_start = ktime_get();
+	bfqd->delta_from_first = now_ns - bfqd->first_dispatch;
 
-	bfq_clear_bfqq_budget_new(bfqq);
-	bfqq->budget_timeout = jiffies +
-		bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] * timeout_coeff;
+	bfq_log(bfqd,
+	"update_peak_rate: added samples %u/%u tot_sects %llu delta_first %lluus",
+		bfqd->peak_rate_samples, bfqd->sequential_samples,
+		bfqd->tot_sectors_dispatched,
+		bfqd->delta_from_first>>10);
 
-	bfq_log_bfqq(bfqd, bfqq, "set budget_timeout %u",
-		jiffies_to_msecs(bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] *
-		timeout_coeff));
+	/* Target observation interval not yet reached, go on sampling */
+	if (bfqd->delta_from_first < BFQ_RATE_REF_INTERVAL)
+		goto update_last_values;
+
+update_rate_and_reset:
+	bfq_update_rate_reset(bfqd, rq);
+update_last_values:
+	bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
+	bfqd->last_dispatch = now_ns;
+
+	bfq_log(bfqd,
+	"update_peak_rate: delta_first %lluus last_pos %llu peak_rate %llu",
+		(now_ns - bfqd->first_dispatch)>>10,
+		(unsigned long long) bfqd->last_position,
+		((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT));
+	bfq_log(bfqd,
+	"update_peak_rate: samples at end %d", bfqd->peak_rate_samples);
 }
 
 /*
- * Move request from internal lists to the request queue dispatch list.
+ * Move request from internal lists to the dispatch list of the request queue
  */
 static void bfq_dispatch_insert(struct request_queue *q, struct request *rq)
 {
-	struct bfq_data *bfqd = q->elevator->elevator_data;
 	struct bfq_queue *bfqq = RQ_BFQQ(rq);
 
 	/*
@@ -1794,15 +2610,10 @@ static void bfq_dispatch_insert(struct request_queue *q, struct request *rq)
 	 * incrementing bfqq->dispatched.
 	 */
 	bfqq->dispatched++;
+	bfq_update_peak_rate(q->elevator->elevator_data, rq);
+
 	bfq_remove_request(rq);
 	elv_dispatch_sort(q, rq);
-
-	if (bfq_bfqq_sync(bfqq))
-		bfqd->sync_flight++;
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
-	bfqg_stats_update_dispatch(bfqq_group(bfqq), blk_rq_bytes(rq),
-				   rq->cmd_flags);
-#endif
 }
 
 /*
@@ -1822,25 +2633,16 @@ static struct request *bfq_check_fifo(struct bfq_queue *bfqq)
 
 	rq = rq_entry_fifo(bfqq->fifo.next);
 
-	if (time_before(jiffies, rq->fifo_time))
+	if (ktime_get_ns() < rq->fifo_time)
 		return NULL;
 
 	return rq;
 }
 
-static int bfq_bfqq_budget_left(struct bfq_queue *bfqq)
-{
-	struct bfq_entity *entity = &bfqq->entity;
-
-	return entity->budget - entity->service;
-}
-
 static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq)
 {
 	BUG_ON(bfqq != bfqd->in_service_queue);
 
-	__bfq_bfqd_reset_in_service(bfqd);
-
 	/*
 	 * If this bfqq is shared between multiple processes, check
 	 * to make sure that those processes are still issuing I/Os
@@ -1851,20 +2653,30 @@ static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq)
 		bfq_mark_bfqq_split_coop(bfqq);
 
 	if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
-		/*
-		 * Overloading budget_timeout field to store the time
-		 * at which the queue remains with no backlog; used by
-		 * the weight-raising mechanism.
-		 */
-		bfqq->budget_timeout = jiffies;
-		bfq_del_bfqq_busy(bfqd, bfqq, 1);
+		if (bfqq->dispatched == 0)
+			/*
+			 * Overloading budget_timeout field to store
+			 * the time at which the queue remains with no
+			 * backlog and no outstanding request; used by
+			 * the weight-raising mechanism.
+			 */
+			bfqq->budget_timeout = jiffies;
+
+		bfq_del_bfqq_busy(bfqd, bfqq, true);
 	} else {
-		bfq_activate_bfqq(bfqd, bfqq);
+		bfq_requeue_bfqq(bfqd, bfqq);
 		/*
 		 * Resort priority tree of potential close cooperators.
 		 */
 		bfq_pos_tree_add_move(bfqd, bfqq);
 	}
+
+	/*
+	 * All in-service entities must have been properly deactivated
+	 * or requeued before executing the next function, which
+	 * resets all in-service entities as no longer in service.
+	 */
+	__bfq_bfqd_reset_in_service(bfqd);
 }
 
 /**
@@ -1883,10 +2695,19 @@ static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd,
 	struct request *next_rq;
 	int budget, min_budget;
 
-	budget = bfqq->max_budget;
+	BUG_ON(bfqq != bfqd->in_service_queue);
+
 	min_budget = bfq_min_budget(bfqd);
 
-	BUG_ON(bfqq != bfqd->in_service_queue);
+	if (bfqq->wr_coeff == 1)
+		budget = bfqq->max_budget;
+	else /*
+	      * Use a constant, low budget for weight-raised queues,
+	      * to help achieve a low latency. Keep it slightly higher
+	      * than the minimum possible budget, to cause slightly
+	      * fewer expirations.
+	      */
+		budget = 2 * min_budget;
 
 	bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %d, budg left %d",
 		bfqq->entity.budget, bfq_bfqq_budget_left(bfqq));
@@ -1895,7 +2716,7 @@ static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd,
 	bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d",
 		bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->in_service_queue));
 
-	if (bfq_bfqq_sync(bfqq)) {
+	if (bfq_bfqq_sync(bfqq) && bfqq->wr_coeff == 1) {
 		switch (reason) {
 		/*
 		 * Caveat: in all the following cases we trade latency
@@ -1937,14 +2758,10 @@ static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd,
 			break;
 		case BFQ_BFQQ_BUDGET_TIMEOUT:
 			/*
-			 * We double the budget here because: 1) it
-			 * gives the chance to boost the throughput if
-			 * this is not a seeky process (which may have
-			 * bumped into this timeout because of, e.g.,
-			 * ZBR), 2) together with charge_full_budget
-			 * it helps give seeky processes higher
-			 * timestamps, and hence be served less
-			 * frequently.
+			 * We double the budget here because it gives
+			 * the chance to boost the throughput if this
+			 * is not a seeky process (and has bumped into
+			 * this timeout because of, e.g., ZBR).
 			 */
 			budget = min(budget * 2, bfqd->bfq_max_budget);
 			break;
@@ -1961,17 +2778,49 @@ static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd,
 			budget = min(budget * 4, bfqd->bfq_max_budget);
 			break;
 		case BFQ_BFQQ_NO_MORE_REQUESTS:
-		       /*
-			* Leave the budget unchanged.
-			*/
+			/*
+			 * For queues that expire for this reason, it
+			 * is particularly important to keep the
+			 * budget close to the actual service they
+			 * need. Doing so reduces the timestamp
+			 * misalignment problem described in the
+			 * comments in the body of
+			 * __bfq_activate_entity. In fact, suppose
+			 * that a queue systematically expires for
+			 * BFQ_BFQQ_NO_MORE_REQUESTS and presents a
+			 * new request in time to enjoy timestamp
+			 * back-shifting. The larger the budget of the
+			 * queue is with respect to the service the
+			 * queue actually requests in each service
+			 * slot, the more times the queue can be
+			 * reactivated with the same virtual finish
+			 * time. It follows that, even if this finish
+			 * time is pushed to the system virtual time
+			 * to reduce the consequent timestamp
+			 * misalignment, the queue unjustly enjoys for
+			 * many re-activations a lower finish time
+			 * than all newly activated queues.
+			 *
+			 * The service needed by bfqq is measured
+			 * quite precisely by bfqq->entity.service.
+			 * Since bfqq does not enjoy device idling,
+			 * bfqq->entity.service is equal to the number
+			 * of sectors that the process associated with
+			 * bfqq requested to read/write before waiting
+			 * for request completions, or blocking for
+			 * other reasons.
+			 */
+			budget = max_t(int, bfqq->entity.service, min_budget);
+			break;
 		default:
 			return;
 		}
-	} else
+	} else if (!bfq_bfqq_sync(bfqq))
 		/*
-		 * Async queues get always the maximum possible budget
-		 * (their ability to dispatch is limited by
-		 * @bfqd->bfq_max_budget_async_rq).
+		 * Async queues always get the maximum possible
+		 * budget, as for them we do not care about latency
+		 * (in addition, their ability to dispatch is limited
+		 * by the charging factor).
 		 */
 		budget = bfqd->bfq_max_budget;
 
@@ -1982,160 +2831,120 @@ static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd,
 		bfqq->max_budget = min(bfqq->max_budget, bfqd->bfq_max_budget);
 
 	/*
-	 * Make sure that we have enough budget for the next request.
-	 * Since the finish time of the bfqq must be kept in sync with
-	 * the budget, be sure to call __bfq_bfqq_expire() after the
+	 * If there is still backlog, then assign a new budget, making
+	 * sure that it is large enough for the next request.  Since
+	 * the finish time of bfqq must be kept in sync with the
+	 * budget, be sure to call __bfq_bfqq_expire() *after* this
 	 * update.
+	 *
+	 * If there is no backlog, then no need to update the budget;
+	 * it will be updated on the arrival of a new request.
 	 */
 	next_rq = bfqq->next_rq;
-	if (next_rq)
+	if (next_rq) {
+		BUG_ON(reason == BFQ_BFQQ_TOO_IDLE ||
+		       reason == BFQ_BFQQ_NO_MORE_REQUESTS);
 		bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget,
 					    bfq_serv_to_charge(next_rq, bfqq));
-	else
-		bfqq->entity.budget = bfqq->max_budget;
+		BUG_ON(!bfq_bfqq_busy(bfqq));
+		BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list));
+	}
 
 	bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %d",
 			next_rq ? blk_rq_sectors(next_rq) : 0,
 			bfqq->entity.budget);
 }
 
-static unsigned long bfq_calc_max_budget(u64 peak_rate, u64 timeout)
-{
-	unsigned long max_budget;
-
-	/*
-	 * The max_budget calculated when autotuning is equal to the
-	 * amount of sectors transfered in timeout_sync at the
-	 * estimated peak rate.
-	 */
-	max_budget = (unsigned long)(peak_rate * 1000 *
-				     timeout >> BFQ_RATE_SHIFT);
-
-	return max_budget;
-}
-
 /*
- * In addition to updating the peak rate, checks whether the process
- * is "slow", and returns 1 if so. This slow flag is used, in addition
- * to the budget timeout, to reduce the amount of service provided to
- * seeky processes, and hence reduce their chances to lower the
- * throughput. See the code for more details.
+ * Return true if the process associated with bfqq is "slow". The slow
+ * flag is used, in addition to the budget timeout, to reduce the
+ * amount of service provided to seeky processes, and thus reduce
+ * their chances to lower the throughput. More details in the comments
+ * on the function bfq_bfqq_expire().
+ *
+ * An important observation is in order: as discussed in the comments
+ * on the function bfq_update_peak_rate(), with devices with internal
+ * queues, it is hard, if at all possible, to know when and for how long
+ * an I/O request is processed by the device (apart from the trivial
+ * I/O pattern where a new request is dispatched only after the
+ * previous one has been completed). This makes it hard to evaluate
+ * the real rate at which the I/O requests of each bfq_queue are
+ * served.  In fact, for an I/O scheduler like BFQ, serving a
+ * bfq_queue means just dispatching its requests during its service
+ * slot (i.e., until the budget of the queue is exhausted, or the
+ * queue remains idle, or, finally, a timeout fires). But, during the
+ * service slot of a bfq_queue, around 100 ms at most, the device may
+ * be even still processing requests of bfq_queues served in previous
+ * service slots. On the opposite end, the requests of the in-service
+ * bfq_queue may be completed after the service slot of the queue
+ * finishes.
+ *
+ * Anyway, unless more sophisticated solutions are used
+ * (where possible), the sum of the sizes of the requests dispatched
+ * during the service slot of a bfq_queue is probably the only
+ * approximation available for the service received by the bfq_queue
+ * during its service slot. And this sum is the quantity used in this
+ * function to evaluate the I/O speed of a process.
  */
-static bool bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq,
-				 bool compensate, enum bfqq_expiration reason)
+static bool bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+				 bool compensate, enum bfqq_expiration reason,
+				 unsigned long *delta_ms)
 {
-	u64 bw, usecs, expected, timeout;
-	ktime_t delta;
-	int update = 0;
+	ktime_t delta_ktime;
+	u32 delta_usecs;
+	bool slow = BFQQ_SEEKY(bfqq); /* if delta too short, use seekyness */
 
-	if (!bfq_bfqq_sync(bfqq) || bfq_bfqq_budget_new(bfqq))
+	if (!bfq_bfqq_sync(bfqq))
 		return false;
 
 	if (compensate)
-		delta = bfqd->last_idling_start;
+		delta_ktime = bfqd->last_idling_start;
 	else
-		delta = ktime_get();
-	delta = ktime_sub(delta, bfqd->last_budget_start);
-	usecs = ktime_to_us(delta);
-
-	/* Don't trust short/unrealistic values. */
-	if (usecs < 100 || usecs >= LONG_MAX)
-		return false;
-
-	/*
-	 * Calculate the bandwidth for the last slice.  We use a 64 bit
-	 * value to store the peak rate, in sectors per usec in fixed
-	 * point math.  We do so to have enough precision in the estimate
-	 * and to avoid overflows.
-	 */
-	bw = (u64)bfqq->entity.service << BFQ_RATE_SHIFT;
-	do_div(bw, (unsigned long)usecs);
-
-	timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]);
-
-	/*
-	 * Use only long (> 20ms) intervals to filter out spikes for
-	 * the peak rate estimation.
-	 */
-	if (usecs > 20000) {
-		if (bw > bfqd->peak_rate ||
-		   (!BFQQ_SEEKY(bfqq) &&
-		    reason == BFQ_BFQQ_BUDGET_TIMEOUT)) {
-			bfq_log(bfqd, "measured bw =%llu", bw);
-			/*
-			 * To smooth oscillations use a low-pass filter with
-			 * alpha=7/8, i.e.,
-			 * new_rate = (7/8) * old_rate + (1/8) * bw
-			 */
-			do_div(bw, 8);
-			if (bw == 0)
-				return 0;
-			bfqd->peak_rate *= 7;
-			do_div(bfqd->peak_rate, 8);
-			bfqd->peak_rate += bw;
-			update = 1;
-			bfq_log(bfqd, "new peak_rate=%llu", bfqd->peak_rate);
-		}
-
-		update |= bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES - 1;
-
-		if (bfqd->peak_rate_samples < BFQ_PEAK_RATE_SAMPLES)
-			bfqd->peak_rate_samples++;
-
-		if (bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES &&
-		    update) {
-			int dev_type = blk_queue_nonrot(bfqd->queue);
-
-			if (bfqd->bfq_user_max_budget == 0) {
-				bfqd->bfq_max_budget =
-					bfq_calc_max_budget(bfqd->peak_rate,
-							    timeout);
-				bfq_log(bfqd, "new max_budget=%d",
-					bfqd->bfq_max_budget);
-			}
-			if (bfqd->device_speed == BFQ_BFQD_FAST &&
-			    bfqd->peak_rate < device_speed_thresh[dev_type]) {
-				bfqd->device_speed = BFQ_BFQD_SLOW;
-				bfqd->RT_prod = R_slow[dev_type] *
-						T_slow[dev_type];
-			} else if (bfqd->device_speed == BFQ_BFQD_SLOW &&
-			    bfqd->peak_rate > device_speed_thresh[dev_type]) {
-				bfqd->device_speed = BFQ_BFQD_FAST;
-				bfqd->RT_prod = R_fast[dev_type] *
-						T_fast[dev_type];
-			}
-		}
+		delta_ktime = ktime_get();
+	delta_ktime = ktime_sub(delta_ktime, bfqd->last_budget_start);
+	delta_usecs = ktime_to_us(delta_ktime);
+
+	/* don't trust short/unrealistic values. */
+	if (delta_usecs < 1000 || delta_usecs >= LONG_MAX) {
+		if (blk_queue_nonrot(bfqd->queue))
+			 /*
+			  * give same worst-case guarantees as idling
+			  * for seeky
+			  */
+			*delta_ms = BFQ_MIN_TT / NSEC_PER_MSEC;
+		else /* charge at least one seek */
+			*delta_ms = bfq_slice_idle / NSEC_PER_MSEC;
+
+		bfq_log(bfqd, "bfq_bfqq_is_slow: unrealistic %u", delta_usecs);
+
+		return slow;
 	}
 
-	/*
-	 * If the process has been served for a too short time
-	 * interval to let its possible sequential accesses prevail on
-	 * the initial seek time needed to move the disk head on the
-	 * first sector it requested, then give the process a chance
-	 * and for the moment return false.
-	 */
-	if (bfqq->entity.budget <= bfq_max_budget(bfqd) / 8)
-		return false;
+	*delta_ms = delta_usecs / USEC_PER_MSEC;
 
 	/*
-	 * A process is considered ``slow'' (i.e., seeky, so that we
-	 * cannot treat it fairly in the service domain, as it would
-	 * slow down too much the other processes) if, when a slice
-	 * ends for whatever reason, it has received service at a
-	 * rate that would not be high enough to complete the budget
-	 * before the budget timeout expiration.
+	 * Use only long (> 20ms) intervals to filter out excessive
+	 * spikes in service rate estimation.
 	 */
-	expected = bw * 1000 * timeout >> BFQ_RATE_SHIFT;
+	if (delta_usecs > 20000) {
+		/*
+		 * Caveat for rotational devices: processes doing I/O
+		 * in the slower disk zones tend to be slow(er) even
+		 * if not seeky. In this respect, the estimated peak
+		 * rate is likely to be an average over the disk
+		 * surface. Accordingly, to not be too harsh with
+		 * unlucky processes, a process is deemed slow only if
+		 * its rate has been lower than half of the estimated
+		 * peak rate.
+		 */
+		slow = bfqq->entity.service < bfqd->bfq_max_budget / 2;
+		bfq_log(bfqd, "bfq_bfqq_is_slow: relative rate %d/%d",
+			bfqq->entity.service, bfqd->bfq_max_budget);
+	}
 
-	/*
-	 * Caveat: processes doing IO in the slower disk zones will
-	 * tend to be slow(er) even if not seeky. And the estimated
-	 * peak rate will actually be an average over the disk
-	 * surface. Hence, to not be too harsh with unlucky processes,
-	 * we keep a budget/3 margin of safety before declaring a
-	 * process slow.
-	 */
-	return expected > (4 * bfqq->entity.budget) / 3;
+	bfq_log_bfqq(bfqd, bfqq, "bfq_bfqq_is_slow: slow %d", slow);
+
+	return slow;
 }
 
 /*
@@ -2193,20 +3002,35 @@ static bool bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq,
 static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd,
 						struct bfq_queue *bfqq)
 {
+	bfq_log_bfqq(bfqd, bfqq,
+"softrt_next_start: service_blkg %lu soft_rate %u sects/sec interval %u",
+		     bfqq->service_from_backlogged,
+		     bfqd->bfq_wr_max_softrt_rate,
+		     jiffies_to_msecs(HZ * bfqq->service_from_backlogged /
+				      bfqd->bfq_wr_max_softrt_rate));
+
 	return max(bfqq->last_idle_bklogged +
 		   HZ * bfqq->service_from_backlogged /
 		   bfqd->bfq_wr_max_softrt_rate,
-		   jiffies + bfqq->bfqd->bfq_slice_idle + 4);
+		   jiffies + nsecs_to_jiffies(bfqq->bfqd->bfq_slice_idle) + 4);
+}
+
+/*
+ * Return the farthest future time instant according to jiffies
+ * macros.
+ */
+static unsigned long bfq_greatest_from_now(void)
+{
+	return jiffies + MAX_JIFFY_OFFSET;
 }
 
 /*
- * Return the largest-possible time instant such that, for as long as possible,
- * the current time will be lower than this time instant according to the macro
- * time_is_before_jiffies().
+ * Return the farthest past time instant according to jiffies
+ * macros.
  */
-static unsigned long bfq_infinity_from_now(unsigned long now)
+static unsigned long bfq_smallest_from_now(void)
 {
-	return now + ULONG_MAX / 2;
+	return jiffies - MAX_JIFFY_OFFSET;
 }
 
 /**
@@ -2216,28 +3040,24 @@ static unsigned long bfq_infinity_from_now(unsigned long now)
  * @compensate: if true, compensate for the time spent idling.
  * @reason: the reason causing the expiration.
  *
+ * If the process associated with bfqq does slow I/O (e.g., because it
+ * issues random requests), we charge bfqq with the time it has been
+ * in service instead of the service it has received (see
+ * bfq_bfqq_charge_time for details on how this goal is achieved). As
+ * a consequence, bfqq will typically get higher timestamps upon
+ * reactivation, and hence it will be rescheduled as if it had
+ * received more service than what it has actually received. In the
+ * end, bfqq receives less service in proportion to how slowly its
+ * associated process consumes its budgets (and hence how seriously it
+ * tends to lower the throughput). In addition, this time-charging
+ * strategy guarantees time fairness among slow processes. In
+ * contrast, if the process associated with bfqq is not slow, we
+ * charge bfqq exactly with the service it has received.
  *
- * If the process associated to the queue is slow (i.e., seeky), or in
- * case of budget timeout, or, finally, if it is async, we
- * artificially charge it an entire budget (independently of the
- * actual service it received). As a consequence, the queue will get
- * higher timestamps than the correct ones upon reactivation, and
- * hence it will be rescheduled as if it had received more service
- * than what it actually received. In the end, this class of processes
- * will receive less service in proportion to how slowly they consume
- * their budgets (and hence how seriously they tend to lower the
- * throughput).
- *
- * In contrast, when a queue expires because it has been idling for
- * too much or because it exhausted its budget, we do not touch the
- * amount of service it has received. Hence when the queue will be
- * reactivated and its timestamps updated, the latter will be in sync
- * with the actual service received by the queue until expiration.
- *
- * Charging a full budget to the first type of queues and the exact
- * service to the others has the effect of using the WF2Q+ policy to
- * schedule the former on a timeslice basis, without violating the
- * service domain guarantees of the latter.
+ * Charging time to the first type of queues and the exact service to
+ * the other has the effect of using the WF2Q+ policy to schedule the
+ * former on a timeslice basis, without violating service domain
+ * guarantees among the latter.
  */
 static void bfq_bfqq_expire(struct bfq_data *bfqd,
 			    struct bfq_queue *bfqq,
@@ -2245,41 +3065,52 @@ static void bfq_bfqq_expire(struct bfq_data *bfqd,
 			    enum bfqq_expiration reason)
 {
 	bool slow;
+	unsigned long delta = 0;
+	struct bfq_entity *entity = &bfqq->entity;
 
 	BUG_ON(bfqq != bfqd->in_service_queue);
 
 	/*
-	 * Update disk peak rate for autotuning and check whether the
-	 * process is slow (see bfq_update_peak_rate).
+	 * Check whether the process is slow (see bfq_bfqq_is_slow).
 	 */
-	slow = bfq_update_peak_rate(bfqd, bfqq, compensate, reason);
+	slow = bfq_bfqq_is_slow(bfqd, bfqq, compensate, reason, &delta);
 
 	/*
-	 * As above explained, 'punish' slow (i.e., seeky), timed-out
-	 * and async queues, to favor sequential sync workloads.
-	 *
-	 * Processes doing I/O in the slower disk zones will tend to be
-	 * slow(er) even if not seeky. Hence, since the estimated peak
-	 * rate is actually an average over the disk surface, these
-	 * processes may timeout just for bad luck. To avoid punishing
-	 * them we do not charge a full budget to a process that
-	 * succeeded in consuming at least 2/3 of its budget.
+	 * Increase service_from_backlogged before next statement,
+	 * because the possible next invocation of
+	 * bfq_bfqq_charge_time would likely inflate
+	 * entity->service. In contrast, service_from_backlogged must
+	 * contain real service, to enable the soft real-time
+	 * heuristic to correctly compute the bandwidth consumed by
+	 * bfqq.
 	 */
-	if (slow || (reason == BFQ_BFQQ_BUDGET_TIMEOUT &&
-		     bfq_bfqq_budget_left(bfqq) >=  bfqq->entity.budget / 3))
-		bfq_bfqq_charge_full_budget(bfqq);
+	bfqq->service_from_backlogged += entity->service;
 
-	bfqq->service_from_backlogged += bfqq->entity.service;
+	/*
+	 * As above explained, charge slow (typically seeky) and
+	 * timed-out queues with the time and not the service
+	 * received, to favor sequential workloads.
+	 *
+	 * Processes doing I/O in the slower disk zones will tend to
+	 * be slow(er) even if not seeky. Therefore, since the
+	 * estimated peak rate is actually an average over the disk
+	 * surface, these processes may timeout just for bad luck. To
+	 * avoid punishing them, do not charge time to processes that
+	 * succeeded in consuming at least 2/3 of their budget. This
+	 * allows BFQ to preserve enough elasticity to still perform
+	 * bandwidth, and not time, distribution with slightly unlucky
+	 * or quasi-sequential processes.
+	 */
+	if (bfqq->wr_coeff == 1 &&
+	    (slow ||
+	     (reason == BFQ_BFQQ_BUDGET_TIMEOUT &&
+	      bfq_bfqq_budget_left(bfqq) >=  entity->budget / 3)))
+		bfq_bfqq_charge_time(bfqd, bfqq, delta);
 
-	if (BFQQ_SEEKY(bfqq) && reason == BFQ_BFQQ_BUDGET_TIMEOUT &&
-	    !bfq_bfqq_constantly_seeky(bfqq)) {
-		bfq_mark_bfqq_constantly_seeky(bfqq);
-		if (!blk_queue_nonrot(bfqd->queue))
-			bfqd->const_seeky_busy_in_flight_queues++;
-	}
+	BUG_ON(bfqq->entity.budget < bfqq->entity.service);
 
 	if (reason == BFQ_BFQQ_TOO_IDLE &&
-	    bfqq->entity.service <= 2 * bfqq->entity.budget / 10)
+	    entity->service <= 2 * entity->budget / 10)
 		bfq_clear_bfqq_IO_bound(bfqq);
 
 	if (bfqd->low_latency && bfqq->wr_coeff == 1)
@@ -2288,19 +3119,23 @@ static void bfq_bfqq_expire(struct bfq_data *bfqd,
 	if (bfqd->low_latency && bfqd->bfq_wr_max_softrt_rate > 0 &&
 	    RB_EMPTY_ROOT(&bfqq->sort_list)) {
 		/*
-		 * If we get here, and there are no outstanding requests,
-		 * then the request pattern is isochronous (see the comments
-		 * to the function bfq_bfqq_softrt_next_start()). Hence we
-		 * can compute soft_rt_next_start. If, instead, the queue
-		 * still has outstanding requests, then we have to wait
-		 * for the completion of all the outstanding requests to
+		 * If we get here, and there are no outstanding
+		 * requests, then the request pattern is isochronous
+		 * (see the comments on the function
+		 * bfq_bfqq_softrt_next_start()). Thus we can compute
+		 * soft_rt_next_start. If, instead, the queue still
+		 * has outstanding requests, then we have to wait for
+		 * the completion of all the outstanding requests to
 		 * discover whether the request pattern is actually
 		 * isochronous.
 		 */
-		if (bfqq->dispatched == 0)
+		BUG_ON(bfqd->busy_queues < 1);
+		if (bfqq->dispatched == 0) {
 			bfqq->soft_rt_next_start =
 				bfq_bfqq_softrt_next_start(bfqd, bfqq);
-		else {
+			bfq_log_bfqq(bfqd, bfqq, "new soft_rt_next %lu",
+				     bfqq->soft_rt_next_start);
+		} else {
 			/*
 			 * The application is still waiting for the
 			 * completion of one or more requests:
@@ -2317,7 +3152,7 @@ static void bfq_bfqq_expire(struct bfq_data *bfqd,
 			 *    happened to be in the past.
 			 */
 			bfqq->soft_rt_next_start =
-				bfq_infinity_from_now(jiffies);
+				bfq_greatest_from_now();
 			/*
 			 * Schedule an update of soft_rt_next_start to when
 			 * the task may be discovered to be isochronous.
@@ -2327,15 +3162,27 @@ static void bfq_bfqq_expire(struct bfq_data *bfqd,
 	}
 
 	bfq_log_bfqq(bfqd, bfqq,
-		"expire (%d, slow %d, num_disp %d, idle_win %d)", reason,
-		slow, bfqq->dispatched, bfq_bfqq_idle_window(bfqq));
+		"expire (%d, slow %d, num_disp %d, idle_win %d, weight %d)",
+		     reason, slow, bfqq->dispatched,
+		     bfq_bfqq_idle_window(bfqq), entity->weight);
 
 	/*
 	 * Increase, decrease or leave budget unchanged according to
 	 * reason.
 	 */
+	BUG_ON(bfqq->entity.budget < bfqq->entity.service);
 	__bfq_bfqq_recalc_budget(bfqd, bfqq, reason);
+	BUG_ON(bfqq->next_rq == NULL &&
+	       bfqq->entity.budget < bfqq->entity.service);
 	__bfq_bfqq_expire(bfqd, bfqq);
+
+	BUG_ON(!bfq_bfqq_busy(bfqq) && reason == BFQ_BFQQ_BUDGET_EXHAUSTED &&
+		!bfq_class_idle(bfqq));
+
+	if (!bfq_bfqq_busy(bfqq) &&
+	    reason != BFQ_BFQQ_BUDGET_TIMEOUT &&
+	    reason != BFQ_BFQQ_BUDGET_EXHAUSTED)
+		bfq_mark_bfqq_non_blocking_wait_rq(bfqq);
 }
 
 /*
@@ -2345,20 +3192,17 @@ static void bfq_bfqq_expire(struct bfq_data *bfqd,
  */
 static bool bfq_bfqq_budget_timeout(struct bfq_queue *bfqq)
 {
-	if (bfq_bfqq_budget_new(bfqq) ||
-	    time_before(jiffies, bfqq->budget_timeout))
-		return false;
-	return true;
+	return time_is_before_eq_jiffies(bfqq->budget_timeout);
 }
 
 /*
- * If we expire a queue that is waiting for the arrival of a new
- * request, we may prevent the fictitious timestamp back-shifting that
- * allows the guarantees of the queue to be preserved (see [1] for
- * this tricky aspect). Hence we return true only if this condition
- * does not hold, or if the queue is slow enough to deserve only to be
- * kicked off for preserving a high throughput.
-*/
+ * If we expire a queue that is actively waiting (i.e., with the
+ * device idled) for the arrival of a new request, then we may incur
+ * the timestamp misalignment problem described in the body of the
+ * function __bfq_activate_entity. Hence we return true only if this
+ * condition does not hold, or if the queue is slow enough to deserve
+ * only to be kicked off for preserving a high throughput.
+ */
 static bool bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq)
 {
 	bfq_log_bfqq(bfqq->bfqd, bfqq,
@@ -2400,10 +3244,12 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq)
 {
 	struct bfq_data *bfqd = bfqq->bfqd;
 	bool idling_boosts_thr, idling_boosts_thr_without_issues,
-		all_queues_seeky, on_hdd_and_not_all_queues_seeky,
 		idling_needed_for_service_guarantees,
 		asymmetric_scenario;
 
+	if (bfqd->strict_guarantees)
+		return true;
+
 	/*
 	 * The next variable takes into account the cases where idling
 	 * boosts the throughput.
@@ -2466,74 +3312,27 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq)
 		bfqd->wr_busy_queues == 0;
 
 	/*
-	 * There are then two cases where idling must be performed not
+	 * There is then a case where idling must be performed not
 	 * for throughput concerns, but to preserve service
-	 * guarantees. In the description of these cases, we say, for
-	 * short, that a queue is sequential/random if the process
-	 * associated to the queue issues sequential/random requests
-	 * (in the second case the queue may be tagged as seeky or
-	 * even constantly_seeky).
+	 * guarantees.
 	 *
-	 * To introduce the first case, we note that, since
-	 * bfq_bfqq_idle_window(bfqq) is false if the device is
-	 * NCQ-capable and bfqq is random (see
-	 * bfq_update_idle_window()), then, from the above two
-	 * assignments it follows that
-	 * idling_boosts_thr_without_issues is false if the device is
-	 * NCQ-capable and bfqq is random. Therefore, for this case,
-	 * device idling would never be allowed if we used just
-	 * idling_boosts_thr_without_issues to decide whether to allow
-	 * it. And, beneficially, this would imply that throughput
-	 * would always be boosted also with random I/O on NCQ-capable
-	 * HDDs.
-	 *
-	 * But we must be careful on this point, to avoid an unfair
-	 * treatment for bfqq. In fact, because of the same above
-	 * assignments, idling_boosts_thr_without_issues is, on the
-	 * other hand, true if 1) the device is an HDD and bfqq is
-	 * sequential, and 2) there are no busy weight-raised
-	 * queues. As a consequence, if we used just
-	 * idling_boosts_thr_without_issues to decide whether to idle
-	 * the device, then with an HDD we might easily bump into a
-	 * scenario where queues that are sequential and I/O-bound
-	 * would enjoy idling, whereas random queues would not. The
-	 * latter might then get a low share of the device throughput,
-	 * simply because the former would get many requests served
-	 * after being set as in service, while the latter would not.
-	 *
-	 * To address this issue, we start by setting to true a
-	 * sentinel variable, on_hdd_and_not_all_queues_seeky, if the
-	 * device is rotational and not all queues with pending or
-	 * in-flight requests are constantly seeky (i.e., there are
-	 * active sequential queues, and bfqq might then be mistreated
-	 * if it does not enjoy idling because it is random).
-	 */
-	all_queues_seeky = bfq_bfqq_constantly_seeky(bfqq) &&
-			   bfqd->busy_in_flight_queues ==
-			   bfqd->const_seeky_busy_in_flight_queues;
-
-	on_hdd_and_not_all_queues_seeky =
-		!blk_queue_nonrot(bfqd->queue) && !all_queues_seeky;
-
-	/*
-	 * To introduce the second case where idling needs to be
-	 * performed to preserve service guarantees, we can note that
-	 * allowing the drive to enqueue more than one request at a
-	 * time, and hence delegating de facto final scheduling
-	 * decisions to the drive's internal scheduler, causes loss of
-	 * control on the actual request service order. In particular,
-	 * the critical situation is when requests from different
-	 * processes happens to be present, at the same time, in the
-	 * internal queue(s) of the drive. In such a situation, the
-	 * drive, by deciding the service order of the
-	 * internally-queued requests, does determine also the actual
-	 * throughput distribution among these processes. But the
-	 * drive typically has no notion or concern about per-process
-	 * throughput distribution, and makes its decisions only on a
-	 * per-request basis. Therefore, the service distribution
-	 * enforced by the drive's internal scheduler is likely to
-	 * coincide with the desired device-throughput distribution
-	 * only in a completely symmetric scenario where:
+	 * To introduce this case, we can note that allowing the drive
+	 * to enqueue more than one request at a time, and hence
+	 * delegating de facto final scheduling decisions to the
+	 * drive's internal scheduler, entails loss of control on the
+	 * actual request service order. In particular, the critical
+	 * situation is when requests from different processes happen
+	 * to be present, at the same time, in the internal queue(s)
+	 * of the drive. In such a situation, the drive, by deciding
+	 * the service order of the internally-queued requests, does
+	 * determine also the actual throughput distribution among
+	 * these processes. But the drive typically has no notion or
+	 * concern about per-process throughput distribution, and
+	 * makes its decisions only on a per-request basis. Therefore,
+	 * the service distribution enforced by the drive's internal
+	 * scheduler is likely to coincide with the desired
+	 * device-throughput distribution only in a completely
+	 * symmetric scenario where:
 	 * (i)  each of these processes must get the same throughput as
 	 *      the others;
 	 * (ii) all these processes have the same I/O pattern
@@ -2555,26 +3354,53 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq)
 	 * words, only if sub-condition (i) holds, then idling is
 	 * allowed, and the device tends to be prevented from queueing
 	 * many requests, possibly of several processes. The reason
-	 * for not controlling also sub-condition (ii) is that, first,
-	 * in the case of an HDD, the asymmetry in terms of types of
-	 * I/O patterns is already taken in to account in the above
-	 * sentinel variable
-	 * on_hdd_and_not_all_queues_seeky. Secondly, in the case of a
-	 * flash-based device, we prefer however to privilege
-	 * throughput (and idling lowers throughput for this type of
-	 * devices), for the following reasons:
-	 * 1) differently from HDDs, the service time of random
-	 *    requests is not orders of magnitudes lower than the service
-	 *    time of sequential requests; thus, even if processes doing
-	 *    sequential I/O get a preferential treatment with respect to
-	 *    others doing random I/O, the consequences are not as
-	 *    dramatic as with HDDs;
-	 * 2) if a process doing random I/O does need strong
-	 *    throughput guarantees, it is hopefully already being
-	 *    weight-raised, or the user is likely to have assigned it a
-	 *    higher weight than the other processes (and thus
-	 *    sub-condition (i) is likely to be false, which triggers
-	 *    idling).
+	 * for not controlling also sub-condition (ii) is that we
+	 * exploit preemption to preserve guarantees in case of
+	 * symmetric scenarios, even if (ii) does not hold, as
+	 * explained in the next two paragraphs.
+	 *
+	 * Even if a queue, say Q, is expired when it remains idle, Q
+	 * can still preempt the new in-service queue if the next
+	 * request of Q arrives soon (see the comments on
+	 * bfq_bfqq_update_budg_for_activation). If all queues and
+	 * groups have the same weight, this form of preemption,
+	 * combined with the hole-recovery heuristic described in the
+	 * comments on function bfq_bfqq_update_budg_for_activation,
+	 * is enough to preserve a correct bandwidth distribution in
+	 * the mid term, even without idling. In fact, even if not
+	 * idling allows the internal queues of the device to contain
+	 * many requests, and thus to reorder requests, we can rather
+	 * safely assume that the internal scheduler still preserves a
+	 * minimum of mid-term fairness. The motivation for using
+	 * preemption instead of idling is that, by not idling,
+	 * service guarantees are preserved without sacrificing
+	 * throughput at all. In other words, both a high
+	 * throughput and its desired distribution are obtained.
+	 *
+	 * More precisely, this preemption-based, idleless approach
+	 * provides fairness in terms of IOPS, and not sectors per
+	 * second. This can be seen with a simple example. Suppose
+	 * that there are two queues with the same weight, but that
+	 * the first queue receives requests of 8 sectors, while the
+	 * second queue receives requests of 1024 sectors. In
+	 * addition, suppose that each of the two queues contains at
+	 * most one request at a time, which implies that each queue
+	 * always remains idle after it is served. Finally, after
+	 * remaining idle, each queue receives very quickly a new
+	 * request. It follows that the two queues are served
+	 * alternately, preempting each other if needed. This
+	 * implies that, although both queues have the same weight,
+	 * the queue with large requests receives a service that is
+	 * 1024/8 times as high as the service received by the other
+	 * queue.
+	 *
+	 * On the other hand, device idling is performed, and thus
+	 * pure sector-domain guarantees are provided, for the
+	 * following queues, which are likely to need stronger
+	 * throughput guarantees: weight-raised queues, and queues
+	 * with a higher weight than other queues. When such queues
+	 * are active, sub-condition (i) is false, which triggers
+	 * device idling.
 	 *
 	 * According to the above considerations, the next variable is
 	 * true (only) if sub-condition (i) holds. To compute the
@@ -2582,7 +3408,7 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq)
 	 * the function bfq_symmetric_scenario(), but also check
 	 * whether bfqq is being weight-raised, because
 	 * bfq_symmetric_scenario() does not take into account also
-	 * weight-raised queues (see comments to
+	 * weight-raised queues (see comments on
 	 * bfq_weights_tree_add()).
 	 *
 	 * As a side note, it is worth considering that the above
@@ -2604,17 +3430,16 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq)
 	 * bfqq. Such a case is when bfqq became active in a burst of
 	 * queue activations. Queues that became active during a large
 	 * burst benefit only from throughput, as discussed in the
-	 * comments to bfq_handle_burst. Thus, if bfqq became active
+	 * comments on bfq_handle_burst. Thus, if bfqq became active
 	 * in a burst and not idling the device maximizes throughput,
 	 * then the device must no be idled, because not idling the
 	 * device provides bfqq and all other queues in the burst with
-	 * maximum benefit. Combining this and the two cases above, we
-	 * can now establish when idling is actually needed to
-	 * preserve service guarantees.
+	 * maximum benefit. Combining this and the above case, we can
+	 * now establish when idling is actually needed to preserve
+	 * service guarantees.
 	 */
 	idling_needed_for_service_guarantees =
-		(on_hdd_and_not_all_queues_seeky || asymmetric_scenario) &&
-		!bfq_bfqq_in_large_burst(bfqq);
+		asymmetric_scenario && !bfq_bfqq_in_large_burst(bfqq);
 
 	/*
 	 * We have now all the components we need to compute the return
@@ -2624,6 +3449,16 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq)
 	 * 2) idling either boosts the throughput (without issues), or
 	 *    is necessary to preserve service guarantees.
 	 */
+	bfq_log_bfqq(bfqd, bfqq, "may_idle: sync %d idling_boosts_thr %d",
+		     bfq_bfqq_sync(bfqq), idling_boosts_thr);
+
+	bfq_log_bfqq(bfqd, bfqq,
+		     "may_idle: wr_busy %d boosts %d IO-bound %d guar %d",
+		     bfqd->wr_busy_queues,
+		     idling_boosts_thr_without_issues,
+		     bfq_bfqq_IO_bound(bfqq),
+		     idling_needed_for_service_guarantees);
+
 	return bfq_bfqq_sync(bfqq) &&
 		(idling_boosts_thr_without_issues ||
 		 idling_needed_for_service_guarantees);
@@ -2635,7 +3470,7 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq)
  * 1) the queue must remain in service and cannot be expired, and
  * 2) the device must be idled to wait for the possible arrival of a new
  *    request for the queue.
- * See the comments to the function bfq_bfqq_may_idle for the reasons
+ * See the comments on the function bfq_bfqq_may_idle for the reasons
  * why performing device idling is the best choice to boost the throughput
  * and preserve service guarantees when bfq_bfqq_may_idle itself
  * returns true.
@@ -2665,7 +3500,7 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
 	bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue");
 
 	if (bfq_may_expire_for_budg_timeout(bfqq) &&
-	    !timer_pending(&bfqd->idle_slice_timer) &&
+	    !hrtimer_active(&bfqd->idle_slice_timer) &&
 	    !bfq_bfqq_must_idle(bfqq))
 		goto expire;
 
@@ -2685,7 +3520,8 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
 			 * not disable disk idling even when a new request
 			 * arrives.
 			 */
-			if (timer_pending(&bfqd->idle_slice_timer)) {
+			if (bfq_bfqq_wait_request(bfqq)) {
+				BUG_ON(!hrtimer_active(&bfqd->idle_slice_timer));
 				/*
 				 * If we get here: 1) at least a new request
 				 * has arrived but we have not disabled the
@@ -2700,10 +3536,8 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
 				 * So we disable idling.
 				 */
 				bfq_clear_bfqq_wait_request(bfqq);
-				del_timer(&bfqd->idle_slice_timer);
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
+				hrtimer_try_to_cancel(&bfqd->idle_slice_timer);
 				bfqg_stats_update_idle_time(bfqq_group(bfqq));
-#endif
 			}
 			goto keep_queue;
 		}
@@ -2714,7 +3548,7 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
 	 * for a new request, or has requests waiting for a completion and
 	 * may idle after their completion, then keep it anyway.
 	 */
-	if (timer_pending(&bfqd->idle_slice_timer) ||
+	if (hrtimer_active(&bfqd->idle_slice_timer) ||
 	    (bfqq->dispatched != 0 && bfq_bfqq_may_idle(bfqq))) {
 		bfqq = NULL;
 		goto keep_queue;
@@ -2736,6 +3570,9 @@ static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq)
 	struct bfq_entity *entity = &bfqq->entity;
 
 	if (bfqq->wr_coeff > 1) { /* queue is being weight-raised */
+		BUG_ON(bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time &&
+		       time_is_after_jiffies(bfqq->last_wr_start_finish));
+
 		bfq_log_bfqq(bfqd, bfqq,
 			"raising period dur %u/%u msec, old coeff %u, w %d(%d)",
 			jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish),
@@ -2749,22 +3586,30 @@ static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq)
 			bfq_log_bfqq(bfqd, bfqq, "WARN: pending prio change");
 
 		/*
-		 * If the queue was activated in a burst, or
-		 * too much time has elapsed from the beginning
-		 * of this weight-raising period, or the queue has
-		 * exceeded the acceptable number of cooperations,
-		 * then end weight raising.
+		 * If the queue was activated in a burst, or too much
+		 * time has elapsed from the beginning of this
+		 * weight-raising period, then end weight raising.
 		 */
-		if (bfq_bfqq_in_large_burst(bfqq) ||
-		    bfq_bfqq_cooperations(bfqq) >= bfqd->bfq_coop_thresh ||
-		    time_is_before_jiffies(bfqq->last_wr_start_finish +
-					   bfqq->wr_cur_max_time)) {
-			bfqq->last_wr_start_finish = jiffies;
-			bfq_log_bfqq(bfqd, bfqq,
-				     "wrais ending at %lu, rais_max_time %u",
-				     bfqq->last_wr_start_finish,
-				     jiffies_to_msecs(bfqq->wr_cur_max_time));
+		if (bfq_bfqq_in_large_burst(bfqq))
 			bfq_bfqq_end_wr(bfqq);
+		else if (time_is_before_jiffies(bfqq->last_wr_start_finish +
+					   bfqq->wr_cur_max_time)) {
+			if (bfqq->wr_cur_max_time != bfqd->bfq_wr_rt_max_time ||
+			time_is_before_jiffies(bfqq->wr_start_at_switch_to_srt +
+					bfq_wr_duration(bfqd)))
+				bfq_bfqq_end_wr(bfqq);
+			else {
+				/* switch back to interactive wr */
+				bfqq->wr_coeff = bfqd->bfq_wr_coeff;
+				bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
+				bfqq->last_wr_start_finish =
+					bfqq->wr_start_at_switch_to_srt;
+				BUG_ON(time_is_after_jiffies(
+					       bfqq->last_wr_start_finish));
+				bfqq->entity.prio_changed = 1;
+				bfq_log_bfqq(bfqd, bfqq,
+					"back to interactive wr");
+			}
 		}
 	}
 	/* Update weight both if it must be raised and if it must be lowered */
@@ -2815,13 +3660,29 @@ static int bfq_dispatch_request(struct bfq_data *bfqd,
 		 */
 		if (!bfqd->rq_in_driver)
 			bfq_schedule_dispatch(bfqd);
+		BUG_ON(bfqq->entity.budget < bfqq->entity.service);
 		goto expire;
 	}
 
+	BUG_ON(bfqq->entity.budget < bfqq->entity.service);
 	/* Finally, insert request into driver dispatch list. */
 	bfq_bfqq_served(bfqq, service_to_charge);
+
+	BUG_ON(bfqq->entity.budget < bfqq->entity.service);
+
 	bfq_dispatch_insert(bfqd->queue, rq);
 
+	/*
+	 * If weight raising has to terminate for bfqq, then next
+	 * function causes an immediate update of bfqq's weight,
+	 * without waiting for next activation. As a consequence, on
+	 * expiration, bfqq will be timestamped as if it has never been
+	 * weight-raised during this service slot, even if it has
+	 * received part or even most of the service as a
+	 * weight-raised queue. This inflates bfqq's timestamps, which
+	 * is beneficial, as bfqq is then more willing to leave the
+	 * device immediately to possible other weight-raised queues.
+	 */
 	bfq_update_wr_data(bfqd, bfqq);
 
 	bfq_log_bfqq(bfqd, bfqq,
@@ -2837,9 +3698,7 @@ static int bfq_dispatch_request(struct bfq_data *bfqd,
 		bfqd->in_service_bic = RQ_BIC(rq);
 	}
 
-	if (bfqd->busy_queues > 1 && ((!bfq_bfqq_sync(bfqq) &&
-	    dispatched >= bfqd->bfq_max_budget_async_rq) ||
-	    bfq_class_idle(bfqq)))
+	if (bfqd->busy_queues > 1 && bfq_class_idle(bfqq))
 		goto expire;
 
 	return dispatched;
@@ -2885,8 +3744,8 @@ static int bfq_forced_dispatch(struct bfq_data *bfqd)
 		st = bfq_entity_service_tree(&bfqq->entity);
 
 		dispatched += __bfq_forced_dispatch_bfqq(bfqq);
-		bfqq->max_budget = bfq_max_budget(bfqd);
 
+		bfqq->max_budget = bfq_max_budget(bfqd);
 		bfq_forget_idle(st);
 	}
 
@@ -2899,37 +3758,37 @@ static int bfq_dispatch_requests(struct request_queue *q, int force)
 {
 	struct bfq_data *bfqd = q->elevator->elevator_data;
 	struct bfq_queue *bfqq;
-	int max_dispatch;
 
 	bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues);
+
 	if (bfqd->busy_queues == 0)
 		return 0;
 
 	if (unlikely(force))
 		return bfq_forced_dispatch(bfqd);
 
+	/*
+	 * Force device to serve one request at a time if
+	 * strict_guarantees is true. Forcing this service scheme is
+	 * currently the ONLY way to guarantee that the request
+	 * service order enforced by the scheduler is respected by a
+	 * queueing device. Otherwise the device is free even to make
+	 * some unlucky request wait for as long as the device
+	 * wishes.
+	 *
+	 * Of course, serving one request at a time may cause loss of
+	 * throughput.
+	 */
+	if (bfqd->strict_guarantees && bfqd->rq_in_driver > 0)
+		return 0;
+
 	bfqq = bfq_select_queue(bfqd);
 	if (!bfqq)
 		return 0;
 
-	if (bfq_class_idle(bfqq))
-		max_dispatch = 1;
-
-	if (!bfq_bfqq_sync(bfqq))
-		max_dispatch = bfqd->bfq_max_budget_async_rq;
-
-	if (!bfq_bfqq_sync(bfqq) && bfqq->dispatched >= max_dispatch) {
-		if (bfqd->busy_queues > 1)
-			return 0;
-		if (bfqq->dispatched >= 4 * max_dispatch)
-			return 0;
-	}
-
-	if (bfqd->sync_flight != 0 && !bfq_bfqq_sync(bfqq))
-		return 0;
+	BUG_ON(bfqq->entity.budget < bfqq->entity.service);
 
-	bfq_clear_bfqq_wait_request(bfqq);
-	BUG_ON(timer_pending(&bfqd->idle_slice_timer));
+	BUG_ON(bfq_bfqq_wait_request(bfqq));
 
 	if (!bfq_dispatch_request(bfqd, bfqq))
 		return 0;
@@ -2937,6 +3796,8 @@ static int bfq_dispatch_requests(struct request_queue *q, int force)
 	bfq_log_bfqq(bfqd, bfqq, "dispatched %s request",
 			bfq_bfqq_sync(bfqq) ? "sync" : "async");
 
+	BUG_ON(bfqq->next_rq == NULL &&
+	       bfqq->entity.budget < bfqq->entity.service);
 	return 1;
 }
 
@@ -2948,23 +3809,21 @@ static int bfq_dispatch_requests(struct request_queue *q, int force)
  */
 static void bfq_put_queue(struct bfq_queue *bfqq)
 {
-	struct bfq_data *bfqd = bfqq->bfqd;
 #ifdef CONFIG_BFQ_GROUP_IOSCHED
 	struct bfq_group *bfqg = bfqq_group(bfqq);
 #endif
 
-	BUG_ON(atomic_read(&bfqq->ref) <= 0);
+	BUG_ON(bfqq->ref <= 0);
 
-	bfq_log_bfqq(bfqd, bfqq, "put_queue: %p %d", bfqq,
-		     atomic_read(&bfqq->ref));
-	if (!atomic_dec_and_test(&bfqq->ref))
+	bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p %d", bfqq, bfqq->ref);
+	bfqq->ref--;
+	if (bfqq->ref)
 		return;
 
 	BUG_ON(rb_first(&bfqq->sort_list));
 	BUG_ON(bfqq->allocated[READ] + bfqq->allocated[WRITE] != 0);
 	BUG_ON(bfqq->entity.tree);
 	BUG_ON(bfq_bfqq_busy(bfqq));
-	BUG_ON(bfqd->in_service_queue == bfqq);
 
 	if (bfq_bfqq_sync(bfqq))
 		/*
@@ -2977,7 +3836,7 @@ static void bfq_put_queue(struct bfq_queue *bfqq)
 		 */
 		hlist_del_init(&bfqq->burst_list_node);
 
-	bfq_log_bfqq(bfqd, bfqq, "put_queue: %p freed", bfqq);
+	bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p freed", bfqq);
 
 	kmem_cache_free(bfq_pool, bfqq);
 #ifdef CONFIG_BFQ_GROUP_IOSCHED
@@ -3011,8 +3870,7 @@ static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
 		bfq_schedule_dispatch(bfqd);
 	}
 
-	bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq,
-		     atomic_read(&bfqq->ref));
+	bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq, bfqq->ref);
 
 	bfq_put_cooperator(bfqq);
 
@@ -3021,28 +3879,7 @@ static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
 
 static void bfq_init_icq(struct io_cq *icq)
 {
-	struct bfq_io_cq *bic = icq_to_bic(icq);
-
-	bic->ttime.last_end_request = jiffies;
-	/*
-	 * A newly created bic indicates that the process has just
-	 * started doing I/O, and is probably mapping into memory its
-	 * executable and libraries: it definitely needs weight raising.
-	 * There is however the possibility that the process performs,
-	 * for a while, I/O close to some other process. EQM intercepts
-	 * this behavior and may merge the queue corresponding to the
-	 * process  with some other queue, BEFORE the weight of the queue
-	 * is raised. Merged queues are not weight-raised (they are assumed
-	 * to belong to processes that benefit only from high throughput).
-	 * If the merge is basically the consequence of an accident, then
-	 * the queue will be split soon and will get back its old weight.
-	 * It is then important to write down somewhere that this queue
-	 * does need weight raising, even if it did not make it to get its
-	 * weight raised before being merged. To this purpose, we overload
-	 * the field raising_time_left and assign 1 to it, to mark the queue
-	 * as needing weight raising.
-	 */
-	bic->wr_time_left = 1;
+	icq_to_bic(icq)->ttime.last_end_request = ktime_get_ns() - (1ULL<<32);
 }
 
 static void bfq_exit_icq(struct io_cq *icq)
@@ -3050,21 +3887,21 @@ static void bfq_exit_icq(struct io_cq *icq)
 	struct bfq_io_cq *bic = icq_to_bic(icq);
 	struct bfq_data *bfqd = bic_to_bfqd(bic);
 
-	if (bic->bfqq[BLK_RW_ASYNC]) {
-		bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_ASYNC]);
-		bic->bfqq[BLK_RW_ASYNC] = NULL;
+	if (bic_to_bfqq(bic, false)) {
+		bfq_exit_bfqq(bfqd, bic_to_bfqq(bic, false));
+		bic_set_bfqq(bic, NULL, false);
 	}
 
-	if (bic->bfqq[BLK_RW_SYNC]) {
+	if (bic_to_bfqq(bic, true)) {
 		/*
 		 * If the bic is using a shared queue, put the reference
 		 * taken on the io_context when the bic started using a
 		 * shared bfq_queue.
 		 */
-		if (bfq_bfqq_coop(bic->bfqq[BLK_RW_SYNC]))
+		if (bfq_bfqq_coop(bic_to_bfqq(bic, true)))
 			put_io_context(icq->ioc);
-		bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_SYNC]);
-		bic->bfqq[BLK_RW_SYNC] = NULL;
+		bfq_exit_bfqq(bfqd, bic_to_bfqq(bic, true));
+		bic_set_bfqq(bic, NULL, true);
 	}
 }
 
@@ -3072,8 +3909,8 @@ static void bfq_exit_icq(struct io_cq *icq)
  * Update the entity prio values; note that the new values will not
  * be used until the next (re)activation.
  */
-static void
-bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic)
+static void bfq_set_next_ioprio_data(struct bfq_queue *bfqq,
+				     struct bfq_io_cq *bic)
 {
 	struct task_struct *tsk = current;
 	int ioprio_class;
@@ -3105,7 +3942,7 @@ bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic)
 		break;
 	}
 
-	if (bfqq->new_ioprio < 0 || bfqq->new_ioprio >= IOPRIO_BE_NR) {
+	if (bfqq->new_ioprio >= IOPRIO_BE_NR) {
 		pr_crit("bfq_set_next_ioprio_data: new_ioprio %d\n",
 			bfqq->new_ioprio);
 		BUG();
@@ -3113,45 +3950,40 @@ bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic)
 
 	bfqq->entity.new_weight = bfq_ioprio_to_weight(bfqq->new_ioprio);
 	bfqq->entity.prio_changed = 1;
+	bfq_log_bfqq(bfqq->bfqd, bfqq,
+		     "set_next_ioprio_data: bic_class %d prio %d class %d",
+		     ioprio_class, bfqq->new_ioprio, bfqq->new_ioprio_class);
 }
 
 static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio)
 {
-	struct bfq_data *bfqd;
-	struct bfq_queue *bfqq, *new_bfqq;
+	struct bfq_data *bfqd = bic_to_bfqd(bic);
+	struct bfq_queue *bfqq;
 	unsigned long uninitialized_var(flags);
 	int ioprio = bic->icq.ioc->ioprio;
 
-	bfqd = bfq_get_bfqd_locked(&(bic->icq.q->elevator->elevator_data),
-				   &flags);
 	/*
 	 * This condition may trigger on a newly created bic, be sure to
 	 * drop the lock before returning.
 	 */
 	if (unlikely(!bfqd) || likely(bic->ioprio == ioprio))
-		goto out;
+		return;
 
 	bic->ioprio = ioprio;
 
-	bfqq = bic->bfqq[BLK_RW_ASYNC];
+	bfqq = bic_to_bfqq(bic, false);
 	if (bfqq) {
-		new_bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic,
-					 GFP_ATOMIC);
-		if (new_bfqq) {
-			bic->bfqq[BLK_RW_ASYNC] = new_bfqq;
-			bfq_log_bfqq(bfqd, bfqq,
-				     "check_ioprio_change: bfqq %p %d",
-				     bfqq, atomic_read(&bfqq->ref));
-			bfq_put_queue(bfqq);
-		}
+		bfq_put_queue(bfqq);
+		bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic);
+		bic_set_bfqq(bic, bfqq, false);
+		bfq_log_bfqq(bfqd, bfqq,
+			     "check_ioprio_change: bfqq %p %d",
+			     bfqq, bfqq->ref);
 	}
 
-	bfqq = bic->bfqq[BLK_RW_SYNC];
+	bfqq = bic_to_bfqq(bic, true);
 	if (bfqq)
 		bfq_set_next_ioprio_data(bfqq, bic);
-
-out:
-	bfq_put_bfqd_unlock(bfqd, &flags);
 }
 
 static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
@@ -3160,8 +3992,9 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
 	RB_CLEAR_NODE(&bfqq->entity.rb_node);
 	INIT_LIST_HEAD(&bfqq->fifo);
 	INIT_HLIST_NODE(&bfqq->burst_list_node);
+	BUG_ON(!hlist_unhashed(&bfqq->burst_list_node));
 
-	atomic_set(&bfqq->ref, 0);
+	bfqq->ref = 0;
 	bfqq->bfqd = bfqd;
 
 	if (bic)
@@ -3171,6 +4004,7 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
 		if (!bfq_class_idle(bfqq))
 			bfq_mark_bfqq_idle_window(bfqq);
 		bfq_mark_bfqq_sync(bfqq);
+		bfq_mark_bfqq_just_created(bfqq);
 	} else
 		bfq_clear_bfqq_sync(bfqq);
 	bfq_mark_bfqq_IO_bound(bfqq);
@@ -3180,72 +4014,19 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
 	bfqq->pid = pid;
 
 	bfqq->wr_coeff = 1;
-	bfqq->last_wr_start_finish = 0;
+	bfqq->last_wr_start_finish = jiffies;
+	bfqq->wr_start_at_switch_to_srt = bfq_smallest_from_now();
+	bfqq->budget_timeout = bfq_smallest_from_now();
+	bfqq->split_time = bfq_smallest_from_now();
+
 	/*
 	 * Set to the value for which bfqq will not be deemed as
 	 * soft rt when it becomes backlogged.
 	 */
-	bfqq->soft_rt_next_start = bfq_infinity_from_now(jiffies);
-}
-
-static struct bfq_queue *bfq_find_alloc_queue(struct bfq_data *bfqd,
-					      struct bio *bio, int is_sync,
-					      struct bfq_io_cq *bic,
-					      gfp_t gfp_mask)
-{
-	struct bfq_group *bfqg;
-	struct bfq_queue *bfqq, *new_bfqq = NULL;
-	struct blkcg *blkcg;
-
-retry:
-	rcu_read_lock();
-
-	blkcg = bio_blkcg(bio);
-	bfqg = bfq_find_alloc_group(bfqd, blkcg);
-	/* bic always exists here */
-	bfqq = bic_to_bfqq(bic, is_sync);
-
-	/*
-	 * Always try a new alloc if we fall back to the OOM bfqq
-	 * originally, since it should just be a temporary situation.
-	 */
-	if (!bfqq || bfqq == &bfqd->oom_bfqq) {
-		bfqq = NULL;
-		if (new_bfqq) {
-			bfqq = new_bfqq;
-			new_bfqq = NULL;
-		} else if (gfpflags_allow_blocking(gfp_mask)) {
-			rcu_read_unlock();
-			spin_unlock_irq(bfqd->queue->queue_lock);
-			new_bfqq = kmem_cache_alloc_node(bfq_pool,
-					gfp_mask | __GFP_ZERO,
-					bfqd->queue->node);
-			spin_lock_irq(bfqd->queue->queue_lock);
-			if (new_bfqq)
-				goto retry;
-		} else {
-			bfqq = kmem_cache_alloc_node(bfq_pool,
-					gfp_mask | __GFP_ZERO,
-					bfqd->queue->node);
-		}
-
-		if (bfqq) {
-			bfq_init_bfqq(bfqd, bfqq, bic, current->pid,
-				      is_sync);
-			bfq_init_entity(&bfqq->entity, bfqg);
-			bfq_log_bfqq(bfqd, bfqq, "allocated");
-		} else {
-			bfqq = &bfqd->oom_bfqq;
-			bfq_log_bfqq(bfqd, bfqq, "using oom bfqq");
-		}
-	}
+	bfqq->soft_rt_next_start = bfq_greatest_from_now();
 
-	if (new_bfqq)
-		kmem_cache_free(bfq_pool, new_bfqq);
-
-	rcu_read_unlock();
-
-	return bfqq;
+	/* first request is almost certainly seeky */
+	bfqq->seek_history = 1;
 }
 
 static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd,
@@ -3268,90 +4049,86 @@ static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd,
 }
 
 static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
-				       struct bio *bio, int is_sync,
-				       struct bfq_io_cq *bic, gfp_t gfp_mask)
+				       struct bio *bio, bool is_sync,
+				       struct bfq_io_cq *bic)
 {
 	const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
 	const int ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);
 	struct bfq_queue **async_bfqq = NULL;
-	struct bfq_queue *bfqq = NULL;
+	struct bfq_queue *bfqq;
+	struct bfq_group *bfqg;
 
-	if (!is_sync) {
-		struct blkcg *blkcg;
-		struct bfq_group *bfqg;
+	rcu_read_lock();
+
+	bfqg = bfq_find_set_group(bfqd, bio_blkcg(bio));
+	if (!bfqg) {
+		bfqq = &bfqd->oom_bfqq;
+		goto out;
+	}
 
-		rcu_read_lock();
-		blkcg = bio_blkcg(bio);
-		rcu_read_unlock();
-		bfqg = bfq_find_alloc_group(bfqd, blkcg);
+	if (!is_sync) {
 		async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class,
 						  ioprio);
 		bfqq = *async_bfqq;
+		if (bfqq)
+			goto out;
 	}
 
-	if (!bfqq)
-		bfqq = bfq_find_alloc_queue(bfqd, bio, is_sync, bic, gfp_mask);
+	bfqq = kmem_cache_alloc_node(bfq_pool, GFP_NOWAIT | __GFP_ZERO,
+				     bfqd->queue->node);
+
+	if (bfqq) {
+		bfq_init_bfqq(bfqd, bfqq, bic, current->pid,
+			      is_sync);
+		bfq_init_entity(&bfqq->entity, bfqg);
+		bfq_log_bfqq(bfqd, bfqq, "allocated");
+	} else {
+		bfqq = &bfqd->oom_bfqq;
+		bfq_log_bfqq(bfqd, bfqq, "using oom bfqq");
+		goto out;
+	}
 
 	/*
 	 * Pin the queue now that it's allocated, scheduler exit will
 	 * prune it.
 	 */
-	if (!is_sync && !(*async_bfqq)) {
-		atomic_inc(&bfqq->ref);
+	if (async_bfqq) {
+		bfqq->ref++;
 		bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d",
-			     bfqq, atomic_read(&bfqq->ref));
+			     bfqq, bfqq->ref);
 		*async_bfqq = bfqq;
 	}
 
-	atomic_inc(&bfqq->ref);
-	bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq,
-		     atomic_read(&bfqq->ref));
+out:
+	bfqq->ref++;
+	bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq, bfqq->ref);
+	rcu_read_unlock();
 	return bfqq;
 }
 
 static void bfq_update_io_thinktime(struct bfq_data *bfqd,
 				    struct bfq_io_cq *bic)
 {
-	unsigned long elapsed = jiffies - bic->ttime.last_end_request;
-	unsigned long ttime = min(elapsed, 2UL * bfqd->bfq_slice_idle);
+	struct bfq_ttime *ttime = &bic->ttime;
+	u64 elapsed = ktime_get_ns() - bic->ttime.last_end_request;
 
-	bic->ttime.ttime_samples = (7*bic->ttime.ttime_samples + 256) / 8;
-	bic->ttime.ttime_total = (7*bic->ttime.ttime_total + 256*ttime) / 8;
-	bic->ttime.ttime_mean = (bic->ttime.ttime_total + 128) /
-				bic->ttime.ttime_samples;
+	elapsed = min_t(u64, elapsed, 2 * bfqd->bfq_slice_idle);
+
+	ttime->ttime_samples = (7*bic->ttime.ttime_samples + 256) / 8;
+	ttime->ttime_total = div_u64(7*ttime->ttime_total + 256*elapsed,  8);
+	ttime->ttime_mean = div64_ul(ttime->ttime_total + 128,
+				     ttime->ttime_samples);
 }
 
-static void bfq_update_io_seektime(struct bfq_data *bfqd,
-				   struct bfq_queue *bfqq,
-				   struct request *rq)
+static void
+bfq_update_io_seektime(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+		       struct request *rq)
 {
-	sector_t sdist;
-	u64 total;
-
-	if (bfqq->last_request_pos < blk_rq_pos(rq))
-		sdist = blk_rq_pos(rq) - bfqq->last_request_pos;
-	else
-		sdist = bfqq->last_request_pos - blk_rq_pos(rq);
-
-	/*
-	 * Don't allow the seek distance to get too large from the
-	 * odd fragment, pagein, etc.
-	 */
-	if (bfqq->seek_samples == 0) /* first request, not really a seek */
-		sdist = 0;
-	else if (bfqq->seek_samples <= 60) /* second & third seek */
-		sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*1024);
-	else
-		sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*64);
-
-	bfqq->seek_samples = (7*bfqq->seek_samples + 256) / 8;
-	bfqq->seek_total = (7*bfqq->seek_total + (u64)256*sdist) / 8;
-	total = bfqq->seek_total + (bfqq->seek_samples/2);
-	do_div(total, bfqq->seek_samples);
-	bfqq->seek_mean = (sector_t)total;
-
-	bfq_log_bfqq(bfqd, bfqq, "dist=%llu mean=%llu", (u64)sdist,
-			(u64)bfqq->seek_mean);
+	bfqq->seek_history <<= 1;
+	bfqq->seek_history |=
+		get_sdist(bfqq->last_request_pos, rq) > BFQQ_SEEK_THR &&
+		(!blk_queue_nonrot(bfqd->queue) ||
+		 blk_rq_sectors(rq) < BFQQ_SECT_THR_NONROT);
 }
 
 /*
@@ -3369,7 +4146,8 @@ static void bfq_update_idle_window(struct bfq_data *bfqd,
 		return;
 
 	/* Idle window just restored, statistics are meaningless. */
-	if (bfq_bfqq_just_split(bfqq))
+	if (time_is_after_eq_jiffies(bfqq->split_time +
+				     bfqd->bfq_wr_min_idle_time))
 		return;
 
 	enable_idle = bfq_bfqq_idle_window(bfqq);
@@ -3409,22 +4187,13 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
 
 	bfq_update_io_thinktime(bfqd, bic);
 	bfq_update_io_seektime(bfqd, bfqq, rq);
-	if (!BFQQ_SEEKY(bfqq) && bfq_bfqq_constantly_seeky(bfqq)) {
-		bfq_clear_bfqq_constantly_seeky(bfqq);
-		if (!blk_queue_nonrot(bfqd->queue)) {
-			BUG_ON(!bfqd->const_seeky_busy_in_flight_queues);
-			bfqd->const_seeky_busy_in_flight_queues--;
-		}
-	}
 	if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 ||
 	    !BFQQ_SEEKY(bfqq))
 		bfq_update_idle_window(bfqd, bfqq, bic);
-	bfq_clear_bfqq_just_split(bfqq);
 
 	bfq_log_bfqq(bfqd, bfqq,
-		     "rq_enqueued: idle_window=%d (seeky %d, mean %llu)",
-		     bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq),
-		     (unsigned long long) bfqq->seek_mean);
+		     "rq_enqueued: idle_window=%d (seeky %d)",
+		     bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq));
 
 	bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);
 
@@ -3438,14 +4207,15 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
 		 * is small and the queue is not to be expired, then
 		 * just exit.
 		 *
-		 * In this way, if the disk is being idled to wait for
-		 * a new request from the in-service queue, we avoid
-		 * unplugging the device and committing the disk to serve
-		 * just a small request. On the contrary, we wait for
-		 * the block layer to decide when to unplug the device:
-		 * hopefully, new requests will be merged to this one
-		 * quickly, then the device will be unplugged and
-		 * larger requests will be dispatched.
+		 * In this way, if the device is being idled to wait
+		 * for a new request from the in-service queue, we
+		 * avoid unplugging the device and committing the
+		 * device to serve just a small request. On the
+		 * contrary, we wait for the block layer to decide
+		 * when to unplug the device: hopefully, new requests
+		 * will be merged to this one quickly, then the device
+		 * will be unplugged and larger requests will be
+		 * dispatched.
 		 */
 		if (small_req && !budget_timeout)
 			return;
@@ -3457,10 +4227,8 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
 		 * timer.
 		 */
 		bfq_clear_bfqq_wait_request(bfqq);
-		del_timer(&bfqd->idle_slice_timer);
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
+		hrtimer_try_to_cancel(&bfqd->idle_slice_timer);
 		bfqg_stats_update_idle_time(bfqq_group(bfqq));
-#endif
 
 		/*
 		 * The queue is not empty, because a new request just
@@ -3504,28 +4272,20 @@ static void bfq_insert_request(struct request_queue *q, struct request *rq)
 			 */
 			new_bfqq->allocated[rq_data_dir(rq)]++;
 			bfqq->allocated[rq_data_dir(rq)]--;
-			atomic_inc(&new_bfqq->ref);
+			new_bfqq->ref++;
+			bfq_clear_bfqq_just_created(bfqq);
 			bfq_put_queue(bfqq);
 			if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq)
 				bfq_merge_bfqqs(bfqd, RQ_BIC(rq),
 						bfqq, new_bfqq);
 			rq->elv.priv[1] = new_bfqq;
 			bfqq = new_bfqq;
-		} else
-			bfq_bfqq_increase_failed_cooperations(bfqq);
+		}
 	}
 
 	bfq_add_request(rq);
 
-	/*
-	 * Here a newly-created bfq_queue has already started a weight-raising
-	 * period: clear raising_time_left to prevent bfq_bfqq_save_state()
-	 * from assigning it a full weight-raising period. See the detailed
-	 * comments about this field in bfq_init_icq().
-	 */
-	if (bfqq->bic)
-		bfqq->bic->wr_time_left = 0;
-	rq->fifo_time = jiffies + bfqd->bfq_fifo_expire[rq_is_sync(rq)];
+	rq->fifo_time = ktime_get_ns() + bfqd->bfq_fifo_expire[rq_is_sync(rq)];
 	list_add_tail(&rq->queuelist, &bfqq->fifo);
 
 	bfq_rq_enqueued(bfqd, bfqq, rq);
@@ -3533,8 +4293,8 @@ static void bfq_insert_request(struct request_queue *q, struct request *rq)
 
 static void bfq_update_hw_tag(struct bfq_data *bfqd)
 {
-	bfqd->max_rq_in_driver = max(bfqd->max_rq_in_driver,
-				     bfqd->rq_in_driver);
+	bfqd->max_rq_in_driver = max_t(int, bfqd->max_rq_in_driver,
+				       bfqd->rq_in_driver);
 
 	if (bfqd->hw_tag == 1)
 		return;
@@ -3560,48 +4320,85 @@ static void bfq_completed_request(struct request_queue *q, struct request *rq)
 {
 	struct bfq_queue *bfqq = RQ_BFQQ(rq);
 	struct bfq_data *bfqd = bfqq->bfqd;
-	bool sync = bfq_bfqq_sync(bfqq);
+	u64 now_ns;
+	u32 delta_us;
 
-	bfq_log_bfqq(bfqd, bfqq, "completed one req with %u sects left (%d)",
-		     blk_rq_sectors(rq), sync);
+	bfq_log_bfqq(bfqd, bfqq, "completed one req with %u sects left",
+		     blk_rq_sectors(rq));
 
+	assert_spin_locked(bfqd->queue->queue_lock);
 	bfq_update_hw_tag(bfqd);
 
 	BUG_ON(!bfqd->rq_in_driver);
 	BUG_ON(!bfqq->dispatched);
 	bfqd->rq_in_driver--;
 	bfqq->dispatched--;
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
 	bfqg_stats_update_completion(bfqq_group(bfqq),
 				     rq_start_time_ns(rq),
-				     rq_io_start_time_ns(rq), rq->cmd_flags);
-#endif
+				     rq_io_start_time_ns(rq), req_op(rq),
+				     rq->cmd_flags);
 
 	if (!bfqq->dispatched && !bfq_bfqq_busy(bfqq)) {
+		BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list));
+		/*
+		 * Set budget_timeout (which we overload to store the
+		 * time at which the queue is left with no backlog and
+		 * no outstanding requests; used by the weight-raising
+		 * mechanism).
+		 */
+		bfqq->budget_timeout = jiffies;
+
 		bfq_weights_tree_remove(bfqd, &bfqq->entity,
 					&bfqd->queue_weights_tree);
-		if (!blk_queue_nonrot(bfqd->queue)) {
-			BUG_ON(!bfqd->busy_in_flight_queues);
-			bfqd->busy_in_flight_queues--;
-			if (bfq_bfqq_constantly_seeky(bfqq)) {
-				BUG_ON(!bfqd->
-					const_seeky_busy_in_flight_queues);
-				bfqd->const_seeky_busy_in_flight_queues--;
-			}
-		}
 	}
 
-	if (sync) {
-		bfqd->sync_flight--;
-		RQ_BIC(rq)->ttime.last_end_request = jiffies;
-	}
+	now_ns = ktime_get_ns();
+
+	RQ_BIC(rq)->ttime.last_end_request = now_ns;
+
+	/*
+	 * Using us instead of ns, to get a reasonable precision in
+	 * computing rate in next check.
+	 */
+	delta_us = div_u64(now_ns - bfqd->last_completion, NSEC_PER_USEC);
+
+	bfq_log(bfqd, "rq_completed: delta %uus/%luus max_size %u rate %llu/%llu",
+		delta_us, BFQ_MIN_TT/NSEC_PER_USEC, bfqd->last_rq_max_size,
+		(USEC_PER_SEC*
+		(u64)((bfqd->last_rq_max_size<<BFQ_RATE_SHIFT)/delta_us))
+			>>BFQ_RATE_SHIFT,
+		(USEC_PER_SEC*(u64)(1UL<<(BFQ_RATE_SHIFT-10)))>>BFQ_RATE_SHIFT);
+
+	/*
+	 * If the request took rather long to complete, and, according
+	 * to the maximum request size recorded, this completion latency
+	 * implies that the request was certainly served at a very low
+	 * rate (less than 1M sectors/sec), then the whole observation
+	 * interval that lasts up to this time instant cannot be a
+	 * valid time interval for computing a new peak rate.  Invoke
+	 * bfq_update_rate_reset to have the following three steps
+	 * taken:
+	 * - close the observation interval at the last (previous)
+	 *   request dispatch or completion
+	 * - compute rate, if possible, for that observation interval
+	 * - reset to zero samples, which will trigger a proper
+	 *   re-initialization of the observation interval on next
+	 *   dispatch
+	 */
+	if (delta_us > BFQ_MIN_TT/NSEC_PER_USEC &&
+	   (bfqd->last_rq_max_size<<BFQ_RATE_SHIFT)/delta_us <
+			1UL<<(BFQ_RATE_SHIFT - 10))
+		bfq_update_rate_reset(bfqd, NULL);
+	bfqd->last_completion = now_ns;
 
 	/*
-	 * If we are waiting to discover whether the request pattern of the
-	 * task associated with the queue is actually isochronous, and
-	 * both requisites for this condition to hold are satisfied, then
-	 * compute soft_rt_next_start (see the comments to the function
-	 * bfq_bfqq_softrt_next_start()).
+	 * If we are waiting to discover whether the request pattern
+	 * of the task associated with the queue is actually
+	 * isochronous, and both requisites for this condition to hold
+	 * are now satisfied, then compute soft_rt_next_start (see the
+	 * comments on the function bfq_bfqq_softrt_next_start()). We
+	 * schedule this delayed check when bfqq expires, if it still
+	 * has in-flight requests.
 	 */
 	if (bfq_bfqq_softrt_update(bfqq) && bfqq->dispatched == 0 &&
 	    RB_EMPTY_ROOT(&bfqq->sort_list))
@@ -3613,10 +4410,7 @@ static void bfq_completed_request(struct request_queue *q, struct request *rq)
 	 * or if we want to idle in case it has no pending requests.
 	 */
 	if (bfqd->in_service_queue == bfqq) {
-		if (bfq_bfqq_budget_new(bfqq))
-			bfq_set_budget_timeout(bfqd);
-
-		if (bfq_bfqq_must_idle(bfqq)) {
+		if (bfqq->dispatched == 0 && bfq_bfqq_must_idle(bfqq)) {
 			bfq_arm_slice_timer(bfqd);
 			goto out;
 		} else if (bfq_may_expire_for_budg_timeout(bfqq))
@@ -3646,7 +4440,7 @@ static int __bfq_may_queue(struct bfq_queue *bfqq)
 	return ELV_MQUEUE_MAY;
 }
 
-static int bfq_may_queue(struct request_queue *q, int rw)
+static int bfq_may_queue(struct request_queue *q, int op, int op_flags)
 {
 	struct bfq_data *bfqd = q->elevator->elevator_data;
 	struct task_struct *tsk = current;
@@ -3663,7 +4457,7 @@ static int bfq_may_queue(struct request_queue *q, int rw)
 	if (!bic)
 		return ELV_MQUEUE_MAY;
 
-	bfqq = bic_to_bfqq(bic, rw_is_sync(rw));
+	bfqq = bic_to_bfqq(bic, rw_is_sync(op, op_flags));
 	if (bfqq)
 		return __bfq_may_queue(bfqq);
 
@@ -3687,14 +4481,14 @@ static void bfq_put_request(struct request *rq)
 		rq->elv.priv[1] = NULL;
 
 		bfq_log_bfqq(bfqq->bfqd, bfqq, "put_request %p, %d",
-			     bfqq, atomic_read(&bfqq->ref));
+			     bfqq, bfqq->ref);
 		bfq_put_queue(bfqq);
 	}
 }
 
 /*
  * Returns NULL if a new bfqq should be allocated, or the old bfqq if this
- * was the last process referring to said bfqq.
+ * was the last process referring to that bfqq.
  */
 static struct bfq_queue *
 bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq)
@@ -3732,11 +4526,8 @@ static int bfq_set_request(struct request_queue *q, struct request *rq,
 	unsigned long flags;
 	bool split = false;
 
-	might_sleep_if(gfpflags_allow_blocking(gfp_mask));
-
-	bfq_check_ioprio_change(bic, bio);
-
 	spin_lock_irqsave(q->queue_lock, flags);
+	bfq_check_ioprio_change(bic, bio);
 
 	if (!bic)
 		goto queue_fail;
@@ -3746,23 +4537,47 @@ static int bfq_set_request(struct request_queue *q, struct request *rq,
 new_queue:
 	bfqq = bic_to_bfqq(bic, is_sync);
 	if (!bfqq || bfqq == &bfqd->oom_bfqq) {
-		bfqq = bfq_get_queue(bfqd, bio, is_sync, bic, gfp_mask);
+		if (bfqq)
+			bfq_put_queue(bfqq);
+		bfqq = bfq_get_queue(bfqd, bio, is_sync, bic);
+		BUG_ON(!hlist_unhashed(&bfqq->burst_list_node));
+
 		bic_set_bfqq(bic, bfqq, is_sync);
 		if (split && is_sync) {
+			bfq_log_bfqq(bfqd, bfqq,
+				     "set_request: was_in_list %d "
+				     "was_in_large_burst %d "
+				     "large burst in progress %d",
+				     bic->was_in_burst_list,
+				     bic->saved_in_large_burst,
+				     bfqd->large_burst);
+
 			if ((bic->was_in_burst_list && bfqd->large_burst) ||
-			    bic->saved_in_large_burst)
+			    bic->saved_in_large_burst) {
+				bfq_log_bfqq(bfqd, bfqq,
+					     "set_request: marking in "
+					     "large burst");
 				bfq_mark_bfqq_in_large_burst(bfqq);
-			else {
+			} else {
+				bfq_log_bfqq(bfqd, bfqq,
+					     "set_request: clearing in "
+					     "large burst");
 				bfq_clear_bfqq_in_large_burst(bfqq);
 				if (bic->was_in_burst_list)
 					hlist_add_head(&bfqq->burst_list_node,
 						       &bfqd->burst_list);
 			}
+			bfqq->split_time = jiffies;
 		}
 	} else {
 		/* If the queue was seeky for too long, break it apart. */
 		if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) {
 			bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq");
+
+			/* Update bic before losing reference to bfqq */
+			if (bfq_bfqq_in_large_burst(bfqq))
+				bic->saved_in_large_burst = true;
+
 			bfqq = bfq_split_bfqq(bic, bfqq);
 			split = true;
 			if (!bfqq)
@@ -3771,9 +4586,8 @@ static int bfq_set_request(struct request_queue *q, struct request *rq,
 	}
 
 	bfqq->allocated[rw]++;
-	atomic_inc(&bfqq->ref);
-	bfq_log_bfqq(bfqd, bfqq, "set_request: bfqq %p, %d", bfqq,
-		     atomic_read(&bfqq->ref));
+	bfqq->ref++;
+	bfq_log_bfqq(bfqd, bfqq, "set_request: bfqq %p, %d", bfqq, bfqq->ref);
 
 	rq->elv.priv[0] = bic;
 	rq->elv.priv[1] = bfqq;
@@ -3788,7 +4602,6 @@ static int bfq_set_request(struct request_queue *q, struct request *rq,
 	if (likely(bfqq != &bfqd->oom_bfqq) && bfqq_process_refs(bfqq) == 1) {
 		bfqq->bic = bic;
 		if (split) {
-			bfq_mark_bfqq_just_split(bfqq);
 			/*
 			 * If the queue has just been split from a shared
 			 * queue, restore the idle window and the possible
@@ -3798,6 +4611,9 @@ static int bfq_set_request(struct request_queue *q, struct request *rq,
 		}
 	}
 
+	if (unlikely(bfq_bfqq_just_created(bfqq)))
+		bfq_handle_burst(bfqd, bfqq);
+
 	spin_unlock_irqrestore(q->queue_lock, flags);
 
 	return 0;
@@ -3824,9 +4640,10 @@ static void bfq_kick_queue(struct work_struct *work)
  * Handler of the expiration of the timer running if the in-service queue
  * is idling inside its time slice.
  */
-static void bfq_idle_slice_timer(unsigned long data)
+static enum hrtimer_restart bfq_idle_slice_timer(struct hrtimer *timer)
 {
-	struct bfq_data *bfqd = (struct bfq_data *)data;
+	struct bfq_data *bfqd = container_of(timer, struct bfq_data,
+					     idle_slice_timer);
 	struct bfq_queue *bfqq;
 	unsigned long flags;
 	enum bfqq_expiration reason;
@@ -3844,6 +4661,8 @@ static void bfq_idle_slice_timer(unsigned long data)
 	 */
 	if (bfqq) {
 		bfq_log_bfqq(bfqd, bfqq, "slice_timer expired");
+		bfq_clear_bfqq_wait_request(bfqq);
+
 		if (bfq_bfqq_budget_timeout(bfqq))
 			/*
 			 * Also here the queue can be safely expired
@@ -3869,14 +4688,16 @@ static void bfq_idle_slice_timer(unsigned long data)
 	bfq_schedule_dispatch(bfqd);
 
 	spin_unlock_irqrestore(bfqd->queue->queue_lock, flags);
+	return HRTIMER_NORESTART;
 }
 
 static void bfq_shutdown_timer_wq(struct bfq_data *bfqd)
 {
-	del_timer_sync(&bfqd->idle_slice_timer);
+	hrtimer_cancel(&bfqd->idle_slice_timer);
 	cancel_work_sync(&bfqd->unplug_work);
 }
 
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
 static void __bfq_put_async_bfqq(struct bfq_data *bfqd,
 					struct bfq_queue **bfqq_ptr)
 {
@@ -3885,9 +4706,9 @@ static void __bfq_put_async_bfqq(struct bfq_data *bfqd,
 
 	bfq_log(bfqd, "put_async_bfqq: %p", bfqq);
 	if (bfqq) {
-		bfq_bfqq_move(bfqd, bfqq, &bfqq->entity, root_group);
+		bfq_bfqq_move(bfqd, bfqq, root_group);
 		bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d",
-			     bfqq, atomic_read(&bfqq->ref));
+			     bfqq, bfqq->ref);
 		bfq_put_queue(bfqq);
 		*bfqq_ptr = NULL;
 	}
@@ -3909,6 +4730,7 @@ static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg)
 
 	__bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq);
 }
+#endif
 
 static void bfq_exit_queue(struct elevator_queue *e)
 {
@@ -3922,15 +4744,13 @@ static void bfq_exit_queue(struct elevator_queue *e)
 
 	BUG_ON(bfqd->in_service_queue);
 	list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list)
-		bfq_deactivate_bfqq(bfqd, bfqq, 0);
+		bfq_deactivate_bfqq(bfqd, bfqq, false, false);
 
 	spin_unlock_irq(q->queue_lock);
 
 	bfq_shutdown_timer_wq(bfqd);
 
-	synchronize_rcu();
-
-	BUG_ON(timer_pending(&bfqd->idle_slice_timer));
+	BUG_ON(hrtimer_active(&bfqd->idle_slice_timer));
 
 #ifdef CONFIG_BFQ_GROUP_IOSCHED
 	blkcg_deactivate_policy(q, &blkcg_policy_bfq);
@@ -3954,6 +4774,7 @@ static void bfq_init_root_group(struct bfq_group *root_group,
 	root_group->rq_pos_tree = RB_ROOT;
 	for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
 		root_group->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;
+	root_group->sched_data.bfq_class_idle_last_service = jiffies;
 }
 
 static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
@@ -3978,11 +4799,14 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
 	 * will not attempt to free it.
 	 */
 	bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, NULL, 1, 0);
-	atomic_inc(&bfqd->oom_bfqq.ref);
+	bfqd->oom_bfqq.ref++;
 	bfqd->oom_bfqq.new_ioprio = BFQ_DEFAULT_QUEUE_IOPRIO;
 	bfqd->oom_bfqq.new_ioprio_class = IOPRIO_CLASS_BE;
 	bfqd->oom_bfqq.entity.new_weight =
 		bfq_ioprio_to_weight(bfqd->oom_bfqq.new_ioprio);
+
+	/* oom_bfqq does not participate in bursts */
+	bfq_clear_bfqq_just_created(&bfqd->oom_bfqq);
 	/*
 	 * Trigger weight initialization, according to ioprio, at the
 	 * oom_bfqq's first activation. The oom_bfqq's ioprio and ioprio
@@ -4001,13 +4825,10 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
 		goto out_free;
 	bfq_init_root_group(bfqd->root_group, bfqd);
 	bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group);
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
-	bfqd->active_numerous_groups = 0;
-#endif
 
-	init_timer(&bfqd->idle_slice_timer);
+	hrtimer_init(&bfqd->idle_slice_timer, CLOCK_MONOTONIC,
+		     HRTIMER_MODE_REL);
 	bfqd->idle_slice_timer.function = bfq_idle_slice_timer;
-	bfqd->idle_slice_timer.data = (unsigned long)bfqd;
 
 	bfqd->queue_weights_tree = RB_ROOT;
 	bfqd->group_weights_tree = RB_ROOT;
@@ -4027,21 +4848,19 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
 	bfqd->bfq_back_max = bfq_back_max;
 	bfqd->bfq_back_penalty = bfq_back_penalty;
 	bfqd->bfq_slice_idle = bfq_slice_idle;
-	bfqd->bfq_class_idle_last_service = 0;
-	bfqd->bfq_max_budget_async_rq = bfq_max_budget_async_rq;
-	bfqd->bfq_timeout[BLK_RW_ASYNC] = bfq_timeout_async;
-	bfqd->bfq_timeout[BLK_RW_SYNC] = bfq_timeout_sync;
+	bfqd->bfq_timeout = bfq_timeout;
 
-	bfqd->bfq_coop_thresh = 2;
-	bfqd->bfq_failed_cooperations = 7000;
 	bfqd->bfq_requests_within_timer = 120;
 
-	bfqd->bfq_large_burst_thresh = 11;
-	bfqd->bfq_burst_interval = msecs_to_jiffies(500);
+	bfqd->bfq_large_burst_thresh = 8;
+	bfqd->bfq_burst_interval = msecs_to_jiffies(180);
 
 	bfqd->low_latency = true;
 
-	bfqd->bfq_wr_coeff = 20;
+	/*
+	 * Trade-off between responsiveness and fairness.
+	 */
+	bfqd->bfq_wr_coeff = 30;
 	bfqd->bfq_wr_rt_max_time = msecs_to_jiffies(300);
 	bfqd->bfq_wr_max_time = 0;
 	bfqd->bfq_wr_min_idle_time = msecs_to_jiffies(2000);
@@ -4053,16 +4872,15 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
 					      * video.
 					      */
 	bfqd->wr_busy_queues = 0;
-	bfqd->busy_in_flight_queues = 0;
-	bfqd->const_seeky_busy_in_flight_queues = 0;
 
 	/*
-	 * Begin by assuming, optimistically, that the device peak rate is
-	 * equal to the highest reference rate.
+	 * Begin by assuming, optimistically, that the device is a
+	 * high-speed one, and that its peak rate is equal to 2/3 of
+	 * the highest reference rate.
 	 */
 	bfqd->RT_prod = R_fast[blk_queue_nonrot(bfqd->queue)] *
 			T_fast[blk_queue_nonrot(bfqd->queue)];
-	bfqd->peak_rate = R_fast[blk_queue_nonrot(bfqd->queue)];
+	bfqd->peak_rate = R_fast[blk_queue_nonrot(bfqd->queue)] * 2 / 3;
 	bfqd->device_speed = BFQ_BFQD_FAST;
 
 	return 0;
@@ -4088,7 +4906,7 @@ static int __init bfq_slab_setup(void)
 
 static ssize_t bfq_var_show(unsigned int var, char *page)
 {
-	return sprintf(page, "%d\n", var);
+	return sprintf(page, "%u\n", var);
 }
 
 static ssize_t bfq_var_store(unsigned long *var, const char *page,
@@ -4159,21 +4977,21 @@ static ssize_t bfq_weights_show(struct elevator_queue *e, char *page)
 static ssize_t __FUNC(struct elevator_queue *e, char *page)		\
 {									\
 	struct bfq_data *bfqd = e->elevator_data;			\
-	unsigned int __data = __VAR;					\
-	if (__CONV)							\
+	u64 __data = __VAR;						\
+	if (__CONV == 1)						\
 		__data = jiffies_to_msecs(__data);			\
+	else if (__CONV == 2)						\
+		__data = div_u64(__data, NSEC_PER_MSEC);		\
 	return bfq_var_show(__data, (page));				\
 }
-SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 1);
-SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 1);
+SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 2);
+SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 2);
 SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0);
 SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0);
-SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 1);
+SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 2);
 SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0);
-SHOW_FUNCTION(bfq_max_budget_async_rq_show,
-	      bfqd->bfq_max_budget_async_rq, 0);
-SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout[BLK_RW_SYNC], 1);
-SHOW_FUNCTION(bfq_timeout_async_show, bfqd->bfq_timeout[BLK_RW_ASYNC], 1);
+SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout, 1);
+SHOW_FUNCTION(bfq_strict_guarantees_show, bfqd->strict_guarantees, 0);
 SHOW_FUNCTION(bfq_low_latency_show, bfqd->low_latency, 0);
 SHOW_FUNCTION(bfq_wr_coeff_show, bfqd->bfq_wr_coeff, 0);
 SHOW_FUNCTION(bfq_wr_rt_max_time_show, bfqd->bfq_wr_rt_max_time, 1);
@@ -4183,6 +5001,17 @@ SHOW_FUNCTION(bfq_wr_min_inter_arr_async_show, bfqd->bfq_wr_min_inter_arr_async,
 SHOW_FUNCTION(bfq_wr_max_softrt_rate_show, bfqd->bfq_wr_max_softrt_rate, 0);
 #undef SHOW_FUNCTION
 
+#define USEC_SHOW_FUNCTION(__FUNC, __VAR)				\
+static ssize_t __FUNC(struct elevator_queue *e, char *page)		\
+{									\
+	struct bfq_data *bfqd = e->elevator_data;			\
+	u64 __data = __VAR;						\
+	__data = div_u64(__data, NSEC_PER_USEC);			\
+	return bfq_var_show(__data, (page));				\
+}
+USEC_SHOW_FUNCTION(bfq_slice_idle_us_show, bfqd->bfq_slice_idle);
+#undef USEC_SHOW_FUNCTION
+
 #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV)			\
 static ssize_t								\
 __FUNC(struct elevator_queue *e, const char *page, size_t count)	\
@@ -4194,24 +5023,22 @@ __FUNC(struct elevator_queue *e, const char *page, size_t count)	\
 		__data = (MIN);						\
 	else if (__data > (MAX))					\
 		__data = (MAX);						\
-	if (__CONV)							\
+	if (__CONV == 1)						\
 		*(__PTR) = msecs_to_jiffies(__data);			\
+	else if (__CONV == 2)						\
+		*(__PTR) = (u64)__data * NSEC_PER_MSEC;			\
 	else								\
 		*(__PTR) = __data;					\
 	return ret;							\
 }
 STORE_FUNCTION(bfq_fifo_expire_sync_store, &bfqd->bfq_fifo_expire[1], 1,
-		INT_MAX, 1);
+		INT_MAX, 2);
 STORE_FUNCTION(bfq_fifo_expire_async_store, &bfqd->bfq_fifo_expire[0], 1,
-		INT_MAX, 1);
+		INT_MAX, 2);
 STORE_FUNCTION(bfq_back_seek_max_store, &bfqd->bfq_back_max, 0, INT_MAX, 0);
 STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1,
 		INT_MAX, 0);
-STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 1);
-STORE_FUNCTION(bfq_max_budget_async_rq_store, &bfqd->bfq_max_budget_async_rq,
-		1, INT_MAX, 0);
-STORE_FUNCTION(bfq_timeout_async_store, &bfqd->bfq_timeout[BLK_RW_ASYNC], 0,
-		INT_MAX, 1);
+STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 2);
 STORE_FUNCTION(bfq_wr_coeff_store, &bfqd->bfq_wr_coeff, 1, INT_MAX, 0);
 STORE_FUNCTION(bfq_wr_max_time_store, &bfqd->bfq_wr_max_time, 0, INT_MAX, 1);
 STORE_FUNCTION(bfq_wr_rt_max_time_store, &bfqd->bfq_wr_rt_max_time, 0, INT_MAX,
@@ -4224,6 +5051,23 @@ STORE_FUNCTION(bfq_wr_max_softrt_rate_store, &bfqd->bfq_wr_max_softrt_rate, 0,
 		INT_MAX, 0);
 #undef STORE_FUNCTION
 
+#define USEC_STORE_FUNCTION(__FUNC, __PTR, MIN, MAX)			\
+static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count)\
+{									\
+	struct bfq_data *bfqd = e->elevator_data;			\
+	unsigned long uninitialized_var(__data);			\
+	int ret = bfq_var_store(&__data, (page), count);		\
+	if (__data < (MIN))						\
+		__data = (MIN);						\
+	else if (__data > (MAX))					\
+		__data = (MAX);						\
+	*(__PTR) = (u64)__data * NSEC_PER_USEC;				\
+	return ret;							\
+}
+USEC_STORE_FUNCTION(bfq_slice_idle_us_store, &bfqd->bfq_slice_idle, 0,
+		    UINT_MAX);
+#undef USEC_STORE_FUNCTION
+
 /* do nothing for the moment */
 static ssize_t bfq_weights_store(struct elevator_queue *e,
 				    const char *page, size_t count)
@@ -4231,16 +5075,6 @@ static ssize_t bfq_weights_store(struct elevator_queue *e,
 	return count;
 }
 
-static unsigned long bfq_estimated_max_budget(struct bfq_data *bfqd)
-{
-	u64 timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]);
-
-	if (bfqd->peak_rate_samples >= BFQ_PEAK_RATE_SAMPLES)
-		return bfq_calc_max_budget(bfqd->peak_rate, timeout);
-	else
-		return bfq_default_max_budget;
-}
-
 static ssize_t bfq_max_budget_store(struct elevator_queue *e,
 				    const char *page, size_t count)
 {
@@ -4249,7 +5083,7 @@ static ssize_t bfq_max_budget_store(struct elevator_queue *e,
 	int ret = bfq_var_store(&__data, (page), count);
 
 	if (__data == 0)
-		bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd);
+		bfqd->bfq_max_budget = bfq_calc_max_budget(bfqd);
 	else {
 		if (__data > INT_MAX)
 			__data = INT_MAX;
@@ -4261,6 +5095,10 @@ static ssize_t bfq_max_budget_store(struct elevator_queue *e,
 	return ret;
 }
 
+/*
+ * Leaving this name to preserve name compatibility with cfq
+ * parameters, but this timeout is used for both sync and async.
+ */
 static ssize_t bfq_timeout_sync_store(struct elevator_queue *e,
 				      const char *page, size_t count)
 {
@@ -4273,9 +5111,27 @@ static ssize_t bfq_timeout_sync_store(struct elevator_queue *e,
 	else if (__data > INT_MAX)
 		__data = INT_MAX;
 
-	bfqd->bfq_timeout[BLK_RW_SYNC] = msecs_to_jiffies(__data);
+	bfqd->bfq_timeout = msecs_to_jiffies(__data);
 	if (bfqd->bfq_user_max_budget == 0)
-		bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd);
+		bfqd->bfq_max_budget = bfq_calc_max_budget(bfqd);
+
+	return ret;
+}
+
+static ssize_t bfq_strict_guarantees_store(struct elevator_queue *e,
+				     const char *page, size_t count)
+{
+	struct bfq_data *bfqd = e->elevator_data;
+	unsigned long uninitialized_var(__data);
+	int ret = bfq_var_store(&__data, (page), count);
+
+	if (__data > 1)
+		__data = 1;
+	if (!bfqd->strict_guarantees && __data == 1
+	    && bfqd->bfq_slice_idle < 8 * NSEC_PER_MSEC)
+		bfqd->bfq_slice_idle = 8 * NSEC_PER_MSEC;
+
+	bfqd->strict_guarantees = __data;
 
 	return ret;
 }
@@ -4305,10 +5161,10 @@ static struct elv_fs_entry bfq_attrs[] = {
 	BFQ_ATTR(back_seek_max),
 	BFQ_ATTR(back_seek_penalty),
 	BFQ_ATTR(slice_idle),
+	BFQ_ATTR(slice_idle_us),
 	BFQ_ATTR(max_budget),
-	BFQ_ATTR(max_budget_async_rq),
 	BFQ_ATTR(timeout_sync),
-	BFQ_ATTR(timeout_async),
+	BFQ_ATTR(strict_guarantees),
 	BFQ_ATTR(low_latency),
 	BFQ_ATTR(wr_coeff),
 	BFQ_ATTR(wr_max_time),
@@ -4328,7 +5184,8 @@ static struct elevator_type iosched_bfq = {
 #ifdef CONFIG_BFQ_GROUP_IOSCHED
 		.elevator_bio_merged_fn =	bfq_bio_merged,
 #endif
-		.elevator_allow_merge_fn =	bfq_allow_merge,
+		.elevator_allow_bio_merge_fn =	bfq_allow_bio_merge,
+		.elevator_allow_rq_merge_fn =	bfq_allow_rq_merge,
 		.elevator_dispatch_fn =		bfq_dispatch_requests,
 		.elevator_add_req_fn =		bfq_insert_request,
 		.elevator_activate_req_fn =	bfq_activate_request,
@@ -4351,18 +5208,28 @@ static struct elevator_type iosched_bfq = {
 	.elevator_owner =	THIS_MODULE,
 };
 
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+static struct blkcg_policy blkcg_policy_bfq = {
+	.dfl_cftypes		= bfq_blkg_files,
+	.legacy_cftypes		= bfq_blkcg_legacy_files,
+
+	.cpd_alloc_fn		= bfq_cpd_alloc,
+	.cpd_init_fn		= bfq_cpd_init,
+	.cpd_bind_fn	        = bfq_cpd_init,
+	.cpd_free_fn		= bfq_cpd_free,
+
+	.pd_alloc_fn		= bfq_pd_alloc,
+	.pd_init_fn		= bfq_pd_init,
+	.pd_offline_fn		= bfq_pd_offline,
+	.pd_free_fn		= bfq_pd_free,
+	.pd_reset_stats_fn	= bfq_pd_reset_stats,
+};
+#endif
+
 static int __init bfq_init(void)
 {
 	int ret;
-
-	/*
-	 * Can be 0 on HZ < 1000 setups.
-	 */
-	if (bfq_slice_idle == 0)
-		bfq_slice_idle = 1;
-
-	if (bfq_timeout_async == 0)
-		bfq_timeout_async = 1;
+	char msg[60] = "BFQ I/O-scheduler: v8r7";
 
 #ifdef CONFIG_BFQ_GROUP_IOSCHED
 	ret = blkcg_policy_register(&blkcg_policy_bfq);
@@ -4375,27 +5242,46 @@ static int __init bfq_init(void)
 		goto err_pol_unreg;
 
 	/*
-	 * Times to load large popular applications for the typical systems
-	 * installed on the reference devices (see the comments before the
-	 * definitions of the two arrays).
+	 * Times to load large popular applications for the typical
+	 * systems installed on the reference devices (see the
+	 * comments before the definitions of the next two
+	 * arrays). Actually, we use slightly lower values, as the
+	 * estimated peak rate tends to be smaller than the actual
+	 * peak rate.  The reason for this last fact is that estimates
+	 * are computed over much shorter time intervals than the long
+	 * intervals typically used for benchmarking. Why? First, to
+	 * adapt more quickly to variations. Second, because an I/O
+	 * scheduler cannot rely on a peak-rate-evaluation workload to
+	 * be run for a long time.
 	 */
-	T_slow[0] = msecs_to_jiffies(2600);
-	T_slow[1] = msecs_to_jiffies(1000);
-	T_fast[0] = msecs_to_jiffies(5500);
-	T_fast[1] = msecs_to_jiffies(2000);
+	T_slow[0] = msecs_to_jiffies(3500); /* actually 4 sec */
+	T_slow[1] = msecs_to_jiffies(1000); /* actually 1.5 sec */
+	T_fast[0] = msecs_to_jiffies(7000); /* actually 8 sec */
+	T_fast[1] = msecs_to_jiffies(2500); /* actually 3 sec */
 
 	/*
-	 * Thresholds that determine the switch between speed classes (see
-	 * the comments before the definition of the array).
+	 * Thresholds that determine the switch between speed classes
+	 * (see the comments before the definition of the array
+	 * device_speed_thresh). These thresholds are biased towards
+	 * transitions to the fast class. This is safer than the
+	 * opposite bias. In fact, a wrong transition to the slow
+	 * class results in short weight-raising periods, because the
+	 * speed of the device then tends to be higher than the
+	 * reference peak rate. On the opposite end, a wrong
+	 * transition to the fast class tends to increase
+	 * weight-raising periods, because of the opposite reason.
 	 */
-	device_speed_thresh[0] = (R_fast[0] + R_slow[0]) / 2;
-	device_speed_thresh[1] = (R_fast[1] + R_slow[1]) / 2;
+	device_speed_thresh[0] = (4 * R_slow[0]) / 3;
+	device_speed_thresh[1] = (4 * R_slow[1]) / 3;
 
 	ret = elv_register(&iosched_bfq);
 	if (ret)
 		goto err_pol_unreg;
 
-	pr_info("BFQ I/O-scheduler: v7r11");
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+	strcat(msg, " (with cgroups support)");
+#endif
+	pr_info("%s", msg);
 
 	return 0;
 
diff --git a/block/bfq-sched.c b/block/bfq-sched.c
index a5ed694..797bce7 100644
--- a/block/bfq-sched.c
+++ b/block/bfq-sched.c
@@ -7,28 +7,166 @@
  * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
  *		      Paolo Valente <paolo.valente@unimore.it>
  *
- * Copyright (C) 2010 Paolo Valente <paolo.valente@unimore.it>
+ * Copyright (C) 2015 Paolo Valente <paolo.valente@unimore.it>
+ *
+ * Copyright (C) 2016 Paolo Valente <paolo.valente@linaro.org>
+ */
+
+static struct bfq_group *bfqq_group(struct bfq_queue *bfqq);
+
+/**
+ * bfq_gt - compare two timestamps.
+ * @a: first ts.
+ * @b: second ts.
+ *
+ * Return @a > @b, dealing with wrapping correctly.
+ */
+static int bfq_gt(u64 a, u64 b)
+{
+	return (s64)(a - b) > 0;
+}
+
+static struct bfq_entity *bfq_root_active_entity(struct rb_root *tree)
+{
+	struct rb_node *node = tree->rb_node;
+
+	return rb_entry(node, struct bfq_entity, rb_node);
+}
+
+static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd);
+
+static bool bfq_update_parent_budget(struct bfq_entity *next_in_service);
+
+/**
+ * bfq_update_next_in_service - update sd->next_in_service
+ * @sd: sched_data for which to perform the update.
+ * @new_entity: if not NULL, pointer to the entity whose activation,
+ *		requeueing or repositioning triggered the invocation of
+ *		this function.
+ *
+ * This function is called to update sd->next_in_service, which, in
+ * its turn, may change as a consequence of the insertion or
+ * extraction of an entity into/from one of the active trees of
+ * sd. These insertions/extractions occur as a consequence of
+ * activations/deactivations of entities, with some activations being
+ * 'true' activations, and other activations being requeueings (i.e.,
+ * implementing the second, requeueing phase of the mechanism used to
+ * reposition an entity in its active tree; see comments on
+ * __bfq_activate_entity and __bfq_requeue_entity for details). In
+ * both the last two activation sub-cases, new_entity points to the
+ * just activated or requeued entity.
+ *
+ * Returns true if sd->next_in_service changes in such a way that
+ * entity->parent may become the next_in_service for its parent
+ * entity.
  */
+static bool bfq_update_next_in_service(struct bfq_sched_data *sd,
+				       struct bfq_entity *new_entity)
+{
+	struct bfq_entity *next_in_service = sd->next_in_service;
+	struct bfq_queue *bfqq;
+	bool parent_sched_may_change = false;
+
+	/*
+	 * If this update is triggered by the activation, requeueing
+	 * or repositioning of an entity that does not coincide with
+	 * sd->next_in_service, then a full lookup in the active tree
+	 * can be avoided. In fact, it is enough to check whether the
+	 * just-modified entity has a higher priority than
+	 * sd->next_in_service, or, even if it has the same priority
+	 * as sd->next_in_service, is eligible and has a lower virtual
+	 * finish time than sd->next_in_service. If this compound
+	 * condition holds, then the new entity becomes the new
+	 * next_in_service. Otherwise no change is needed.
+	 */
+	if (new_entity && new_entity != sd->next_in_service) {
+		/*
+		 * Flag used to decide whether to replace
+		 * sd->next_in_service with new_entity. Tentatively
+		 * set to true, and left as true if
+		 * sd->next_in_service is NULL.
+		 */
+		bool replace_next = true;
+
+		/*
+		 * If there is already a next_in_service candidate
+		 * entity, then compare class priorities or timestamps
+		 * to decide whether to replace sd->next_in_service with
+		 * new_entity.
+		 */
+		if (next_in_service) {
+			unsigned int new_entity_class_idx =
+				bfq_class_idx(new_entity);
+			struct bfq_service_tree *st =
+				sd->service_tree + new_entity_class_idx;
+
+			/*
+			 * For efficiency, evaluate the most likely
+			 * sub-condition first.
+			 */
+			replace_next =
+				(new_entity_class_idx ==
+				 bfq_class_idx(next_in_service)
+				 &&
+				 !bfq_gt(new_entity->start, st->vtime)
+				 &&
+				 bfq_gt(next_in_service->finish,
+					new_entity->finish))
+				||
+				new_entity_class_idx <
+				bfq_class_idx(next_in_service);
+		}
+
+		if (replace_next)
+			next_in_service = new_entity;
+	} else /* invoked because of a deactivation: lookup needed */
+		next_in_service = bfq_lookup_next_entity(sd);
+
+	if (next_in_service) {
+		parent_sched_may_change = !sd->next_in_service ||
+			bfq_update_parent_budget(next_in_service);
+	}
+
+	sd->next_in_service = next_in_service;
+
+	if (!next_in_service)
+		return parent_sched_may_change;
 
+	bfqq = bfq_entity_to_bfqq(next_in_service);
+	if (bfqq)
+		bfq_log_bfqq(bfqq->bfqd, bfqq,
+			     "update_next_in_service: chosen this queue");
 #ifdef CONFIG_BFQ_GROUP_IOSCHED
-#define for_each_entity(entity)	\
+	else {
+		struct bfq_group *bfqg =
+			container_of(next_in_service,
+				     struct bfq_group, entity);
+
+		bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg,
+			     "update_next_in_service: chosen this entity");
+	}
+#endif
+	return parent_sched_may_change;
+}
+
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+/* both next loops stop at one of the child entities of the root group */
+#define for_each_entity(entity)				\
 	for (; entity ; entity = entity->parent)
 
 #define for_each_entity_safe(entity, parent) \
 	for (; entity && ({ parent = entity->parent; 1; }); entity = parent)
 
-
-static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd,
-						 int extract,
-						 struct bfq_data *bfqd);
-
-static struct bfq_group *bfqq_group(struct bfq_queue *bfqq);
-
-static void bfq_update_budget(struct bfq_entity *next_in_service)
+/*
+ * Returns true if this budget change may let next_in_service->parent
+ * become the next_in_service entity for its parent entity.
+ */
+static bool bfq_update_parent_budget(struct bfq_entity *next_in_service)
 {
 	struct bfq_entity *bfqg_entity;
 	struct bfq_group *bfqg;
 	struct bfq_sched_data *group_sd;
+	bool ret = false;
 
 	BUG_ON(!next_in_service);
 
@@ -41,60 +179,68 @@ static void bfq_update_budget(struct bfq_entity *next_in_service)
 	 * as it must never become an in-service entity.
 	 */
 	bfqg_entity = bfqg->my_entity;
-	if (bfqg_entity)
+	if (bfqg_entity) {
+		if (bfqg_entity->budget > next_in_service->budget)
+			ret = true;
 		bfqg_entity->budget = next_in_service->budget;
+	}
+
+	return ret;
 }
 
-static int bfq_update_next_in_service(struct bfq_sched_data *sd)
+/*
+ * This function tells whether entity stops being a candidate for next
+ * service, according to the following logic.
+ *
+ * This function is invoked for an entity that is about to be set in
+ * service. If such an entity is a queue, then the entity is no longer
+ * a candidate for next service (i.e., a candidate entity to serve
+ * after the in-service entity is expired). The function then returns
+ * true.
+ *
+ * In contrast, the entity could still be a candidate for next service
+ * if it is not a queue, and has more than one child. In fact, even if
+ * one of its children is about to be set in service, other children
+ * may still be the next to serve. As a consequence, a non-queue
+ * entity is not a candidate for next-service only if it has only one
+ * child. And only if this condition holds, then the function returns
+ * true for a non-queue entity.
+ */
+static bool bfq_no_longer_next_in_service(struct bfq_entity *entity)
 {
-	struct bfq_entity *next_in_service;
+	struct bfq_group *bfqg;
 
-	if (sd->in_service_entity)
-		/* will update/requeue at the end of service */
-		return 0;
+	if (bfq_entity_to_bfqq(entity))
+		return true;
 
-	/*
-	 * NOTE: this can be improved in many ways, such as returning
-	 * 1 (and thus propagating upwards the update) only when the
-	 * budget changes, or caching the bfqq that will be scheduled
-	 * next from this subtree.  By now we worry more about
-	 * correctness than about performance...
-	 */
-	next_in_service = bfq_lookup_next_entity(sd, 0, NULL);
-	sd->next_in_service = next_in_service;
+	bfqg = container_of(entity, struct bfq_group, entity);
 
-	if (next_in_service)
-		bfq_update_budget(next_in_service);
+	BUG_ON(bfqg == ((struct bfq_data *)(bfqg->bfqd))->root_group);
+	BUG_ON(bfqg->active_entities == 0);
+	if (bfqg->active_entities == 1)
+		return true;
 
-	return 1;
+	return false;
 }
 
-static void bfq_check_next_in_service(struct bfq_sched_data *sd,
-				      struct bfq_entity *entity)
-{
-	BUG_ON(sd->next_in_service != entity);
-}
-#else
+#else /* CONFIG_BFQ_GROUP_IOSCHED */
 #define for_each_entity(entity)	\
 	for (; entity ; entity = NULL)
 
 #define for_each_entity_safe(entity, parent) \
 	for (parent = NULL; entity ; entity = parent)
 
-static int bfq_update_next_in_service(struct bfq_sched_data *sd)
+static bool bfq_update_parent_budget(struct bfq_entity *next_in_service)
 {
-	return 0;
+	return false;
 }
 
-static void bfq_check_next_in_service(struct bfq_sched_data *sd,
-				      struct bfq_entity *entity)
+static bool bfq_no_longer_next_in_service(struct bfq_entity *entity)
 {
+	return true;
 }
 
-static void bfq_update_budget(struct bfq_entity *next_in_service)
-{
-}
-#endif
+#endif /* CONFIG_BFQ_GROUP_IOSCHED */
 
 /*
  * Shift for timestamp calculations.  This actually limits the maximum
@@ -105,18 +251,6 @@ static void bfq_update_budget(struct bfq_entity *next_in_service)
  */
 #define WFQ_SERVICE_SHIFT	22
 
-/**
- * bfq_gt - compare two timestamps.
- * @a: first ts.
- * @b: second ts.
- *
- * Return @a > @b, dealing with wrapping correctly.
- */
-static int bfq_gt(u64 a, u64 b)
-{
-	return (s64)(a - b) > 0;
-}
-
 static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity)
 {
 	struct bfq_queue *bfqq = NULL;
@@ -151,20 +285,36 @@ static u64 bfq_delta(unsigned long service, unsigned long weight)
 static void bfq_calc_finish(struct bfq_entity *entity, unsigned long service)
 {
 	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+	unsigned long long start, finish, delta;
 
 	BUG_ON(entity->weight == 0);
 
 	entity->finish = entity->start +
 		bfq_delta(service, entity->weight);
 
+	start = ((entity->start>>10)*1000)>>12;
+	finish = ((entity->finish>>10)*1000)>>12;
+	delta = ((bfq_delta(service, entity->weight)>>10)*1000)>>12;
+
 	if (bfqq) {
 		bfq_log_bfqq(bfqq->bfqd, bfqq,
 			"calc_finish: serv %lu, w %d",
 			service, entity->weight);
 		bfq_log_bfqq(bfqq->bfqd, bfqq,
 			"calc_finish: start %llu, finish %llu, delta %llu",
-			entity->start, entity->finish,
-			bfq_delta(service, entity->weight));
+			start, finish, delta);
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+	} else {
+		struct bfq_group *bfqg =
+			container_of(entity, struct bfq_group, entity);
+
+		bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg,
+			"calc_finish group: serv %lu, w %d",
+			     service, entity->weight);
+		bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg,
+			"calc_finish group: start %llu, finish %llu, delta %llu",
+			start, finish, delta);
+#endif
 	}
 }
 
@@ -293,10 +443,26 @@ static void bfq_update_min(struct bfq_entity *entity, struct rb_node *node)
 static void bfq_update_active_node(struct rb_node *node)
 {
 	struct bfq_entity *entity = rb_entry(node, struct bfq_entity, rb_node);
+	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
 
 	entity->min_start = entity->start;
 	bfq_update_min(entity, node->rb_right);
 	bfq_update_min(entity, node->rb_left);
+
+	if (bfqq) {
+		bfq_log_bfqq(bfqq->bfqd, bfqq,
+			     "update_active_node: new min_start %llu",
+			     ((entity->min_start>>10)*1000)>>12);
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+	} else {
+		struct bfq_group *bfqg =
+			container_of(entity, struct bfq_group, entity);
+
+		bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg,
+			     "update_active_node: new min_start %llu",
+			     ((entity->min_start>>10)*1000)>>12);
+#endif
+	}
 }
 
 /**
@@ -386,8 +552,6 @@ static void bfq_active_insert(struct bfq_service_tree *st,
 		BUG_ON(!bfqg);
 		BUG_ON(!bfqd);
 		bfqg->active_entities++;
-		if (bfqg->active_entities == 2)
-			bfqd->active_numerous_groups++;
 	}
 #endif
 }
@@ -399,7 +563,7 @@ static void bfq_active_insert(struct bfq_service_tree *st,
 static unsigned short bfq_ioprio_to_weight(int ioprio)
 {
 	BUG_ON(ioprio < 0 || ioprio >= IOPRIO_BE_NR);
-	return IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF - ioprio;
+	return (IOPRIO_BE_NR - ioprio) * BFQ_WEIGHT_CONVERSION_COEFF;
 }
 
 /**
@@ -422,9 +586,9 @@ static void bfq_get_entity(struct bfq_entity *entity)
 	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
 
 	if (bfqq) {
-		atomic_inc(&bfqq->ref);
+		bfqq->ref++;
 		bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d",
-			     bfqq, atomic_read(&bfqq->ref));
+			     bfqq, bfqq->ref);
 	}
 }
 
@@ -499,10 +663,6 @@ static void bfq_active_extract(struct bfq_service_tree *st,
 		BUG_ON(!bfqd);
 		BUG_ON(!bfqg->active_entities);
 		bfqg->active_entities--;
-		if (bfqg->active_entities == 1) {
-			BUG_ON(!bfqd->active_numerous_groups);
-			bfqd->active_numerous_groups--;
-		}
 	}
 #endif
 }
@@ -547,12 +707,12 @@ static void bfq_forget_entity(struct bfq_service_tree *st,
 
 	BUG_ON(!entity->on_st);
 
-	entity->on_st = 0;
+	entity->on_st = false;
 	st->wsum -= entity->weight;
 	if (bfqq) {
 		sd = entity->sched_data;
 		bfq_log_bfqq(bfqq->bfqd, bfqq, "forget_entity: %p %d",
-			     bfqq, atomic_read(&bfqq->ref));
+			     bfqq, bfqq->ref);
 		bfq_put_queue(bfqq);
 	}
 }
@@ -602,7 +762,7 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
 
 	if (entity->prio_changed) {
 		struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
-		unsigned short prev_weight, new_weight;
+		unsigned int prev_weight, new_weight;
 		struct bfq_data *bfqd = NULL;
 		struct rb_root *root;
 #ifdef CONFIG_BFQ_GROUP_IOSCHED
@@ -630,7 +790,10 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
 			    entity->new_weight > BFQ_MAX_WEIGHT) {
 				pr_crit("update_weight_prio: new_weight %d\n",
 					entity->new_weight);
-				BUG();
+				if (entity->new_weight < BFQ_MIN_WEIGHT)
+					entity->new_weight = BFQ_MIN_WEIGHT;
+				else
+					entity->new_weight = BFQ_MAX_WEIGHT;
 			}
 			entity->orig_weight = entity->new_weight;
 			if (bfqq)
@@ -661,6 +824,13 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
 		 * associated with its new weight.
 		 */
 		if (prev_weight != new_weight) {
+			if (bfqq)
+				bfq_log_bfqq(bfqq->bfqd, bfqq,
+					     "weight changed %d %d(%d %d)",
+					     prev_weight, new_weight,
+					     entity->orig_weight,
+					     bfqq->wr_coeff);
+
 			root = bfqq ? &bfqd->queue_weights_tree :
 				      &bfqd->group_weights_tree;
 			bfq_weights_tree_remove(bfqd, entity, root);
@@ -707,7 +877,7 @@ static void bfq_bfqq_served(struct bfq_queue *bfqq, int served)
 		st = bfq_entity_service_tree(entity);
 
 		entity->service += served;
-		BUG_ON(entity->service > entity->budget);
+
 		BUG_ON(st->wsum == 0);
 
 		st->vtime += bfq_delta(served, st->wsum);
@@ -716,170 +886,419 @@ static void bfq_bfqq_served(struct bfq_queue *bfqq, int served)
 #ifdef CONFIG_BFQ_GROUP_IOSCHED
 	bfqg_stats_set_start_empty_time(bfqq_group(bfqq));
 #endif
-	bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %d secs", served);
+	st = bfq_entity_service_tree(&bfqq->entity);
+	bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %d secs, vtime %llu on %p",
+		     served,  ((st->vtime>>10)*1000)>>12, st);
 }
 
 /**
- * bfq_bfqq_charge_full_budget - set the service to the entity budget.
+ * bfq_bfqq_charge_time - charge an amount of service equivalent to the length
+ *			  of the time interval during which bfqq has been in
+ *			  service.
+ * @bfqd: the device
  * @bfqq: the queue that needs a service update.
+ * @time_ms: the amount of time during which the queue has received service
+ *
+ * If a queue does not consume its budget fast enough, then providing
+ * the queue with service fairness may impair throughput, more or less
+ * severely. For this reason, queues that consume their budget slowly
+ * are provided with time fairness instead of service fairness. This
+ * goal is achieved through the BFQ scheduling engine, even if such an
+ * engine works in the service, and not in the time domain. The trick
+ * is charging these queues with an inflated amount of service, equal
+ * to the amount of service that they would have received during their
+ * service slot if they had been fast, i.e., if their requests had
+ * been dispatched at a rate equal to the estimated peak rate.
  *
- * When it's not possible to be fair in the service domain, because
- * a queue is not consuming its budget fast enough (the meaning of
- * fast depends on the timeout parameter), we charge it a full
- * budget.  In this way we should obtain a sort of time-domain
- * fairness among all the seeky/slow queues.
+ * It is worth noting that time fairness can cause important
+ * distortions in terms of bandwidth distribution, on devices with
+ * internal queueing. The reason is that I/O requests dispatched
+ * during the service slot of a queue may be served after that service
+ * slot is finished, and may have a total processing time loosely
+ * correlated with the duration of the service slot. This is
+ * especially true for short service slots.
  */
-static void bfq_bfqq_charge_full_budget(struct bfq_queue *bfqq)
+static void bfq_bfqq_charge_time(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+				 unsigned long time_ms)
 {
 	struct bfq_entity *entity = &bfqq->entity;
+	int tot_serv_to_charge = entity->service;
+	unsigned int timeout_ms = jiffies_to_msecs(bfq_timeout);
+
+	if (time_ms > 0 && time_ms < timeout_ms)
+		tot_serv_to_charge =
+			(bfqd->bfq_max_budget * time_ms) / timeout_ms;
 
-	bfq_log_bfqq(bfqq->bfqd, bfqq, "charge_full_budget");
+	if (tot_serv_to_charge < entity->service)
+		tot_serv_to_charge = entity->service;
 
-	bfq_bfqq_served(bfqq, entity->budget - entity->service);
+	bfq_log_bfqq(bfqq->bfqd, bfqq,
+		     "charge_time: %lu/%u ms, %d/%d/%d sectors",
+		     time_ms, timeout_ms, entity->service,
+		     tot_serv_to_charge, entity->budget);
+
+	/* Increase budget to avoid inconsistencies */
+	if (tot_serv_to_charge > entity->budget)
+		entity->budget = tot_serv_to_charge;
+
+	bfq_bfqq_served(bfqq,
+			max_t(int, 0, tot_serv_to_charge - entity->service));
+}
+
+static void bfq_update_fin_time_enqueue(struct bfq_entity *entity,
+					struct bfq_service_tree *st,
+					bool backshifted)
+{
+	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+	struct bfq_sched_data *sd = entity->sched_data;
+
+	st = __bfq_entity_update_weight_prio(st, entity);
+	bfq_calc_finish(entity, entity->budget);
+
+	/*
+	 * If some queues enjoy backshifting for a while, then their
+	 * (virtual) finish timestamps may happen to become lower and
+	 * lower than the system virtual time.  In particular, if
+	 * these queues often happen to be idle for short time
+	 * periods, and during such time periods other queues with
+	 * higher timestamps happen to be busy, then the backshifted
+	 * timestamps of the former queues can become much lower than
+	 * the system virtual time. In fact, to serve the queues with
+	 * higher timestamps while the ones with lower timestamps are
+	 * idle, the system virtual time may be pushed-up to much
+	 * higher values than the finish timestamps of the idle
+	 * queues. As a consequence, the finish timestamps of all new
+	 * or newly activated queues may end up being much larger than
+	 * those of lucky queues with backshifted timestamps. The
+	 * latter queues may then monopolize the device for a lot of
+	 * time. This would simply break service guarantees.
+	 *
+	 * To reduce this problem, push up a little bit the
+	 * backshifted timestamps of the queue associated with this
+	 * entity (only a queue can happen to have the backshifted
+	 * flag set): just enough to let the finish timestamp of the
+	 * queue be equal to the current value of the system virtual
+	 * time. This may introduce a little unfairness among queues
+	 * with backshifted timestamps, but it does not break
+	 * worst-case fairness guarantees.
+	 *
+	 * As a special case, if bfqq is weight-raised, push up
+	 * timestamps much less, to keep very low the probability that
+	 * this push up causes the backshifted finish timestamps of
+	 * weight-raised queues to become higher than the backshifted
+	 * finish timestamps of non weight-raised queues.
+	 */
+	if (backshifted && bfq_gt(st->vtime, entity->finish)) {
+		unsigned long delta = st->vtime - entity->finish;
+
+		if (bfqq)
+			delta /= bfqq->wr_coeff;
+
+		entity->start += delta;
+		entity->finish += delta;
+
+		if (bfqq) {
+			bfq_log_bfqq(bfqq->bfqd, bfqq,
+				     "__activate_entity: new queue finish %llu",
+				     ((entity->finish>>10)*1000)>>12);
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+		} else {
+			struct bfq_group *bfqg =
+				container_of(entity, struct bfq_group, entity);
+
+			bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg,
+				     "__activate_entity: new group finish %llu",
+				     ((entity->finish>>10)*1000)>>12);
+#endif
+		}
+	}
+
+	bfq_active_insert(st, entity);
+
+	if (bfqq) {
+		bfq_log_bfqq(bfqq->bfqd, bfqq,
+			"__activate_entity: queue %seligible in st %p",
+			     entity->start <= st->vtime ? "" : "non ", st);
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+	} else {
+		struct bfq_group *bfqg =
+			container_of(entity, struct bfq_group, entity);
+
+		bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg,
+			"__activate_entity: group %seligible in st %p",
+			     entity->start <= st->vtime ? "" : "non ", st);
+#endif
+	}
+	BUG_ON(RB_EMPTY_ROOT(&st->active));
+	BUG_ON(&st->active != &sd->service_tree->active &&
+	       &st->active != &(sd->service_tree+1)->active &&
+	       &st->active != &(sd->service_tree+2)->active);
 }
 
 /**
- * __bfq_activate_entity - activate an entity.
+ * __bfq_activate_entity - handle activation of entity.
  * @entity: the entity being activated.
+ * @non_blocking_wait_rq: true if entity was waiting for a request
+ *
+ * Called for a 'true' activation, i.e., if entity is not active and
+ * one of its children receives a new request.
  *
- * Called whenever an entity is activated, i.e., it is not active and one
- * of its children receives a new request, or has to be reactivated due to
- * budget exhaustion.  It uses the current budget of the entity (and the
- * service received if @entity is active) of the queue to calculate its
- * timestamps.
+ * Basically, this function updates the timestamps of entity and
+ * inserts entity into its active tree, after possibly extracting it
+ * from its idle tree.
  */
-static void __bfq_activate_entity(struct bfq_entity *entity)
+static void __bfq_activate_entity(struct bfq_entity *entity,
+				  bool non_blocking_wait_rq)
 {
 	struct bfq_sched_data *sd = entity->sched_data;
 	struct bfq_service_tree *st = bfq_entity_service_tree(entity);
+	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+	bool backshifted = false;
+	unsigned long long min_vstart;
 
-	if (entity == sd->in_service_entity) {
-		BUG_ON(entity->tree);
-		/*
-		 * If we are requeueing the current entity we have
-		 * to take care of not charging to it service it has
-		 * not received.
-		 */
-		bfq_calc_finish(entity, entity->service);
-		entity->start = entity->finish;
-		sd->in_service_entity = NULL;
-	} else if (entity->tree == &st->active) {
-		/*
-		 * Requeueing an entity due to a change of some
-		 * next_in_service entity below it.  We reuse the
-		 * old start time.
-		 */
-		bfq_active_extract(st, entity);
-	} else if (entity->tree == &st->idle) {
+	BUG_ON(!sd);
+	BUG_ON(!st);
+
+	/* See comments on bfq_bfqq_update_budg_for_activation */
+	if (non_blocking_wait_rq && bfq_gt(st->vtime, entity->finish)) {
+		backshifted = true;
+		min_vstart = entity->finish;
+	} else
+		min_vstart = st->vtime;
+
+	if (entity->tree == &st->idle) {
 		/*
 		 * Must be on the idle tree, bfq_idle_extract() will
 		 * check for that.
 		 */
 		bfq_idle_extract(st, entity);
-		entity->start = bfq_gt(st->vtime, entity->finish) ?
-				       st->vtime : entity->finish;
+		entity->start = bfq_gt(min_vstart, entity->finish) ?
+			min_vstart : entity->finish;
 	} else {
 		/*
 		 * The finish time of the entity may be invalid, and
 		 * it is in the past for sure, otherwise the queue
 		 * would have been on the idle tree.
 		 */
-		entity->start = st->vtime;
+		entity->start = min_vstart;
 		st->wsum += entity->weight;
 		bfq_get_entity(entity);
 
-		BUG_ON(entity->on_st);
-		entity->on_st = 1;
+		BUG_ON(entity->on_st && bfqq);
+
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+		if (entity->on_st && !bfqq) {
+			struct bfq_group *bfqg =
+				container_of(entity, struct bfq_group,
+					     entity);
+
+			bfq_log_bfqg((struct bfq_data *)bfqg->bfqd,
+				     bfqg,
+				     "activate bug, class %d in_service %p",
+				     bfq_class_idx(entity), sd->in_service_entity);
+		}
+#endif
+		BUG_ON(entity->on_st && !bfqq);
+		entity->on_st = true;
 	}
 
-	st = __bfq_entity_update_weight_prio(st, entity);
-	bfq_calc_finish(entity, entity->budget);
-	bfq_active_insert(st, entity);
+	bfq_update_fin_time_enqueue(entity, st, backshifted);
 }
 
 /**
- * bfq_activate_entity - activate an entity and its ancestors if necessary.
- * @entity: the entity to activate.
+ * __bfq_requeue_entity - handle requeueing or repositioning of an entity.
+ * @entity: the entity being requeued or repositioned.
+ *
+ * Requeueing is needed if this entity stops being served, which
+ * happens if a leaf descendant entity has expired. On the other hand,
+ * repositioning is needed if the next_in_service entity for the child
+ * entity has changed. See the comments inside the function for
+ * details.
  *
- * Activate @entity and all the entities on the path from it to the root.
+ * Basically, this function: 1) removes entity from its active tree if
+ * present there, 2) updates the timestamps of entity and 3) inserts
+ * entity back into its active tree (in the new, right position for
+ * the new values of the timestamps).
  */
-static void bfq_activate_entity(struct bfq_entity *entity)
+static void __bfq_requeue_entity(struct bfq_entity *entity)
+{
+	struct bfq_sched_data *sd = entity->sched_data;
+	struct bfq_service_tree *st = bfq_entity_service_tree(entity);
+
+	BUG_ON(!sd);
+	BUG_ON(!st);
+
+	BUG_ON(entity != sd->in_service_entity &&
+	       entity->tree != &st->active);
+
+	if (entity == sd->in_service_entity) {
+		/*
+		 * We are requeueing the current in-service entity,
+		 * which may have to be done for one of the following
+		 * reasons:
+		 * - entity represents the in-service queue, and the
+		 *   in-service queue is being requeued after an
+		 *   expiration;
+		 * - entity represents a group, and its budget has
+		 *   changed because one of its child entities has
+		 *   just been either activated or requeued for some
+		 *   reason; the timestamps of the entity need then to
+		 *   be updated, and the entity needs to be enqueued
+		 *   or repositioned accordingly.
+		 *
+		 * In particular, before requeueing, the start time of
+		 * the entity must be moved forward to account for the
+		 * service that the entity has received while in
+		 * service. This is done by the next instructions. The
+		 * finish time will then be updated according to this
+		 * new value of the start time, and to the budget of
+		 * the entity.
+		 */
+		bfq_calc_finish(entity, entity->service);
+		entity->start = entity->finish;
+		BUG_ON(entity->tree && entity->tree != &st->active);
+		/*
+		 * In addition, if the entity had more than one child
+		 * when set in service, then it was not extracted from
+		 * the active tree. This implies that the position of
+		 * the entity in the active tree may need to be
+		 * changed now, because we have just updated the start
+		 * time of the entity, and we will update its finish
+		 * time in a moment (the requeueing is then, more
+		 * precisely, a repositioning in this case). To
+		 * implement this repositioning, we: 1) dequeue the
+		 * entity here, 2) update the finish time and
+		 * requeue the entity according to the new
+		 * timestamps below.
+		 */
+		if (entity->tree)
+			bfq_active_extract(st, entity);
+	} else { /* The entity is already active, and not in service */
+		/*
+		 * In this case, this function gets called only if the
+		 * next_in_service entity below this entity has
+		 * changed, and this change has caused the budget of
+		 * this entity to change, which, finally implies that
+		 * the finish time of this entity must be
+		 * updated. Such an update may cause the scheduling,
+		 * i.e., the position in the active tree, of this
+		 * entity to change. We handle this change by: 1)
+		 * dequeueing the entity here, 2) updating the finish
+		 * time and requeueing the entity according to the new
+		 * timestamps below. This is the same approach as the
+		 * non-extracted-entity sub-case above.
+		 */
+		bfq_active_extract(st, entity);
+	}
+
+	bfq_update_fin_time_enqueue(entity, st, false);
+}
+
+static void __bfq_activate_requeue_entity(struct bfq_entity *entity,
+					  struct bfq_sched_data *sd,
+					  bool non_blocking_wait_rq)
+{
+	struct bfq_service_tree *st = bfq_entity_service_tree(entity);
+
+	if (sd->in_service_entity == entity || entity->tree == &st->active)
+		 /*
+		  * in service or already queued on the active tree,
+		  * requeue or reposition
+		  */
+		__bfq_requeue_entity(entity);
+	else
+		/*
+		 * Not in service and not queued on its active tree:
+		 * the entity is idle and this is a true activation.
+		 */
+		__bfq_activate_entity(entity, non_blocking_wait_rq);
+}
+
+
+/**
+ * bfq_activate_requeue_entity - activate or requeue an entity representing
+ *		a bfq_queue, and activate, requeue or reposition all ancestors
+ *		for which such an update becomes necessary.
+ * @entity: the entity to activate.
+ * @non_blocking_wait_rq: true if this entity was waiting for a request
+ * @requeue: true if this is a requeue, which implies that bfqq is
+ *	     being expired; thus ALL its ancestors stop being served and must
+ *	     therefore be requeued
+ */
+static void bfq_activate_requeue_entity(struct bfq_entity *entity,
+					bool non_blocking_wait_rq,
+					bool requeue)
 {
 	struct bfq_sched_data *sd;
 
 	for_each_entity(entity) {
-		__bfq_activate_entity(entity);
-
+		BUG_ON(!entity);
 		sd = entity->sched_data;
-		if (!bfq_update_next_in_service(sd))
-			/*
-			 * No need to propagate the activation to the
-			 * upper entities, as they will be updated when
-			 * the in-service entity is rescheduled.
-			 */
+		__bfq_activate_requeue_entity(entity, sd, non_blocking_wait_rq);
+
+		BUG_ON(RB_EMPTY_ROOT(&sd->service_tree->active) &&
+		       RB_EMPTY_ROOT(&(sd->service_tree+1)->active) &&
+		       RB_EMPTY_ROOT(&(sd->service_tree+2)->active));
+
+		if (!bfq_update_next_in_service(sd, entity) && !requeue) {
+			BUG_ON(!sd->next_in_service);
 			break;
+		}
+		BUG_ON(!sd->next_in_service);
 	}
 }
 
 /**
  * __bfq_deactivate_entity - deactivate an entity from its service tree.
  * @entity: the entity to deactivate.
- * @requeue: if false, the entity will not be put into the idle tree.
- *
- * Deactivate an entity, independently from its previous state.  If the
- * entity was not on a service tree just return, otherwise if it is on
- * any scheduler tree, extract it from that tree, and if necessary
- * and if the caller did not specify @requeue, put it on the idle tree.
+ * @ins_into_idle_tree: if false, the entity will not be put into the
+ *			idle tree.
  *
- * Return %1 if the caller should update the entity hierarchy, i.e.,
- * if the entity was in service or if it was the next_in_service for
- * its sched_data; return %0 otherwise.
+ * Deactivates an entity, independently from its previous state.  Must
+ * be invoked only if entity is on a service tree. Extracts the entity
+ * from that tree, and if necessary and allowed, puts it on the idle
+ * tree.
  */
-static int __bfq_deactivate_entity(struct bfq_entity *entity, int requeue)
+static bool __bfq_deactivate_entity(struct bfq_entity *entity,
+				    bool ins_into_idle_tree)
 {
 	struct bfq_sched_data *sd = entity->sched_data;
-	struct bfq_service_tree *st;
-	int was_in_service;
-	int ret = 0;
-
-	if (sd == NULL || !entity->on_st) /* never activated, or inactive */
-		return 0;
+	struct bfq_service_tree *st = bfq_entity_service_tree(entity);
+	bool was_in_service = entity == sd->in_service_entity;
 
-	st = bfq_entity_service_tree(entity);
-	was_in_service = entity == sd->in_service_entity;
+	if (!entity->on_st) { /* entity never activated, or already inactive */
+		BUG_ON(entity == entity->sched_data->in_service_entity);
+		return false;
+	}
 
-	BUG_ON(was_in_service && entity->tree);
+	BUG_ON(was_in_service && entity->tree && entity->tree != &st->active);
 
-	if (was_in_service) {
+	if (was_in_service)
 		bfq_calc_finish(entity, entity->service);
-		sd->in_service_entity = NULL;
-	} else if (entity->tree == &st->active)
+
+	if (entity->tree == &st->active)
 		bfq_active_extract(st, entity);
-	else if (entity->tree == &st->idle)
+	else if (!was_in_service && entity->tree == &st->idle)
 		bfq_idle_extract(st, entity);
 	else if (entity->tree)
 		BUG();
 
-	if (was_in_service || sd->next_in_service == entity)
-		ret = bfq_update_next_in_service(sd);
-
-	if (!requeue || !bfq_gt(entity->finish, st->vtime))
+	if (!ins_into_idle_tree || !bfq_gt(entity->finish, st->vtime))
 		bfq_forget_entity(st, entity);
 	else
 		bfq_idle_insert(st, entity);
 
-	BUG_ON(sd->in_service_entity == entity);
-	BUG_ON(sd->next_in_service == entity);
-
-	return ret;
+	return true;
 }
 
 /**
- * bfq_deactivate_entity - deactivate an entity.
+ * bfq_deactivate_entity - deactivate an entity representing a bfq_queue.
  * @entity: the entity to deactivate.
- * @requeue: true if the entity can be put on the idle tree
+ * @ins_into_idle_tree: true if the entity can be put on the idle tree
  */
-static void bfq_deactivate_entity(struct bfq_entity *entity, int requeue)
+static void bfq_deactivate_entity(struct bfq_entity *entity,
+				  bool ins_into_idle_tree,
+				  bool expiration)
 {
 	struct bfq_sched_data *sd;
 	struct bfq_entity *parent;
@@ -887,63 +1306,154 @@ static void bfq_deactivate_entity(struct bfq_entity *entity, int requeue)
 	for_each_entity_safe(entity, parent) {
 		sd = entity->sched_data;
 
-		if (!__bfq_deactivate_entity(entity, requeue))
+		BUG_ON(sd == NULL); /*
+				     * It would mean that this is the
+				     * root group.
+				     */
+
+		BUG_ON(expiration && entity != sd->in_service_entity);
+
+		BUG_ON(entity != sd->in_service_entity &&
+		       entity->tree ==
+		       &bfq_entity_service_tree(entity)->active &&
+		       !sd->next_in_service);
+
+		if (!__bfq_deactivate_entity(entity, ins_into_idle_tree)) {
 			/*
-			 * The parent entity is still backlogged, and
-			 * we don't need to update it as it is still
-			 * in service.
+			 * Entity is not on any tree any more, so this
+			 * deactivation is a no-op, and there is
+			 * nothing to change for upper-level entities
+			 * (in case of expiration, this can never
+			 * happen).
 			 */
-			break;
+			BUG_ON(expiration); /*
+					     * entity cannot be already out of
+					     * any tree
+					     */
+			return;
+		}
 
-		if (sd->next_in_service)
+		if (sd->next_in_service == entity)
 			/*
-			 * The parent entity is still backlogged and
-			 * the budgets on the path towards the root
-			 * need to be updated.
+			 * entity was the next_in_service entity,
+			 * then, since entity has just been
+			 * deactivated, a new one must be found.
 			 */
-			goto update;
+			bfq_update_next_in_service(sd, NULL);
+
+		if (sd->next_in_service) {
+			/*
+			 * The parent entity is still backlogged,
+			 * because next_in_service is not NULL. So, no
+			 * further upwards deactivation must be
+			 * performed.  Yet, next_in_service has
+			 * changed.  Then the schedule does need to be
+			 * updated upwards.
+			 */
+			BUG_ON(sd->next_in_service == entity);
+			break;
+		}
 
 		/*
-		 * If we reach there the parent is no more backlogged and
-		 * we want to propagate the dequeue upwards.
+		 * If we get here, then the parent is no more
+		 * backlogged and we need to propagate the
+		 * deactivation upwards. Thus let the loop go on.
 		 */
-		requeue = 1;
-	}
 
-	return;
+		/*
+		 * Also let parent be queued into the idle tree on
+		 * deactivation, to preserve service guarantees, and
+		 * assuming that who invoked this function does not
+		 * need parent entities too to be removed completely.
+		 */
+		ins_into_idle_tree = true;
+	}
 
-update:
+	/*
+	 * If the deactivation loop is fully executed, then there are
+	 * no more entities to touch and next loop is not executed at
+	 * all. Otherwise, requeue remaining entities if they are
+	 * about to stop receiving service, or reposition them if this
+	 * is not the case.
+	 */
 	entity = parent;
 	for_each_entity(entity) {
-		__bfq_activate_entity(entity);
+		struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+
+		/*
+		 * Invoke __bfq_requeue_entity on entity, even if
+		 * already active, to requeue/reposition it in the
+		 * active tree (because sd->next_in_service has
+		 * changed)
+		 */
+		__bfq_requeue_entity(entity);
 
 		sd = entity->sched_data;
-		if (!bfq_update_next_in_service(sd))
+		BUG_ON(expiration && sd->in_service_entity != entity);
+
+		if (bfqq)
+			bfq_log_bfqq(bfqq->bfqd, bfqq,
+				     "invoking udpdate_next for this queue");
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+		else {
+			struct bfq_group *bfqg =
+				container_of(entity,
+					     struct bfq_group, entity);
+
+			bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg,
+				     "invoking udpdate_next for this entity");
+		}
+#endif
+		if (!bfq_update_next_in_service(sd, entity) &&
+		    !expiration)
+			/*
+			 * next_in_service unchanged or not causing
+			 * any change in entity->parent->sd, and no
+			 * requeueing needed for expiration: stop
+			 * here.
+			 */
 			break;
 	}
 }
 
 /**
- * bfq_update_vtime - update vtime if necessary.
+ * bfq_calc_vtime_jump - compute the value to which the vtime should jump,
+ *                       if needed, to have at least one entity eligible.
  * @st: the service tree to act upon.
  *
- * If necessary update the service tree vtime to have at least one
- * eligible entity, skipping to its start time.  Assumes that the
- * active tree of the device is not empty.
- *
- * NOTE: this hierarchical implementation updates vtimes quite often,
- * we may end up with reactivated processes getting timestamps after a
- * vtime skip done because we needed a ->first_active entity on some
- * intermediate node.
+ * Assumes that st is not empty.
  */
-static void bfq_update_vtime(struct bfq_service_tree *st)
+static u64 bfq_calc_vtime_jump(struct bfq_service_tree *st)
 {
-	struct bfq_entity *entry;
-	struct rb_node *node = st->active.rb_node;
+	struct bfq_entity *root_entity = bfq_root_active_entity(&st->active);
+
+	if (bfq_gt(root_entity->min_start, st->vtime)) {
+		struct bfq_queue *bfqq = bfq_entity_to_bfqq(root_entity);
 
-	entry = rb_entry(node, struct bfq_entity, rb_node);
-	if (bfq_gt(entry->min_start, st->vtime)) {
-		st->vtime = entry->min_start;
+		if (bfqq)
+			bfq_log_bfqq(bfqq->bfqd, bfqq,
+				     "calc_vtime_jump: new value %llu",
+				     root_entity->min_start);
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+		else {
+			struct bfq_group *bfqg =
+				container_of(root_entity, struct bfq_group,
+					     entity);
+
+			bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg,
+				     "calc_vtime_jump: new value %llu",
+				     root_entity->min_start);
+		}
+#endif
+		return root_entity->min_start;
+	}
+	return st->vtime;
+}
+
+static void bfq_update_vtime(struct bfq_service_tree *st, u64 new_value)
+{
+	if (new_value > st->vtime) {
+		st->vtime = new_value;
 		bfq_forget_idle(st);
 	}
 }
@@ -952,6 +1462,7 @@ static void bfq_update_vtime(struct bfq_service_tree *st)
  * bfq_first_active_entity - find the eligible entity with
  *                           the smallest finish time
  * @st: the service tree to select from.
+ * @vtime: the system virtual time to use as a reference for eligibility
  *
  * This function searches the first schedulable entity, starting from the
  * root of the tree and going on the left every time on this side there is
@@ -959,7 +1470,8 @@ static void bfq_update_vtime(struct bfq_service_tree *st)
  * the right is followed only if a) the left subtree contains no eligible
  * entities and b) no eligible entity has been found yet.
  */
-static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st)
+static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st,
+						  u64 vtime)
 {
 	struct bfq_entity *entry, *first = NULL;
 	struct rb_node *node = st->active.rb_node;
@@ -967,15 +1479,15 @@ static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st)
 	while (node) {
 		entry = rb_entry(node, struct bfq_entity, rb_node);
 left:
-		if (!bfq_gt(entry->start, st->vtime))
+		if (!bfq_gt(entry->start, vtime))
 			first = entry;
 
-		BUG_ON(bfq_gt(entry->min_start, st->vtime));
+		BUG_ON(bfq_gt(entry->min_start, vtime));
 
 		if (node->rb_left) {
 			entry = rb_entry(node->rb_left,
 					 struct bfq_entity, rb_node);
-			if (!bfq_gt(entry->min_start, st->vtime)) {
+			if (!bfq_gt(entry->min_start, vtime)) {
 				node = node->rb_left;
 				goto left;
 			}
@@ -993,31 +1505,84 @@ static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st)
  * __bfq_lookup_next_entity - return the first eligible entity in @st.
  * @st: the service tree.
  *
- * Update the virtual time in @st and return the first eligible entity
- * it contains.
+ * If there is no in-service entity for the sched_data st belongs to,
+ * then return the entity that will be set in service if:
+ * 1) the parent entity this st belongs to is set in service;
+ * 2) no entity belonging to such parent entity undergoes a state change
+ * that would influence the timestamps of the entity (e.g., becomes idle,
+ * becomes backlogged, changes its budget, ...).
+ *
+ * In this first case, update the virtual time in @st too (see the
+ * comments on this update inside the function).
+ *
+ * In contrast, if there is an in-service entity, then return the
+ * entity that would be set in service if not only the above
+ * conditions, but also the next one held true: the currently
+ * in-service entity, on expiration,
+ * 1) gets a finish time equal to the current one, or
+ * 2) is not eligible any more, or
+ * 3) is idle.
  */
-static struct bfq_entity *__bfq_lookup_next_entity(struct bfq_service_tree *st,
-						   bool force)
+static struct bfq_entity *
+__bfq_lookup_next_entity(struct bfq_service_tree *st, bool in_service
+#if 0
+			 , bool force
+#endif
+	)
 {
-	struct bfq_entity *entity, *new_next_in_service = NULL;
+	struct bfq_entity *entity
+#if 0
+		, *new_next_in_service = NULL
+#endif
+		;
+	u64 new_vtime;
+	struct bfq_queue *bfqq;
 
 	if (RB_EMPTY_ROOT(&st->active))
 		return NULL;
 
-	bfq_update_vtime(st);
-	entity = bfq_first_active_entity(st);
-	BUG_ON(bfq_gt(entity->start, st->vtime));
+	/*
+	 * Get the value of the system virtual time for which at
+	 * least one entity is eligible.
+	 */
+	new_vtime = bfq_calc_vtime_jump(st);
 
 	/*
-	 * If the chosen entity does not match with the sched_data's
-	 * next_in_service and we are forcedly serving the IDLE priority
-	 * class tree, bubble up budget update.
+	 * If there is no in-service entity for the sched_data this
+	 * active tree belongs to, then push the system virtual time
+	 * up to the value that guarantees that at least one entity is
+	 * eligible. If, instead, there is an in-service entity, then
+	 * do not make any such update, because there is already an
+	 * eligible entity, namely the in-service one (even if the
+	 * entity is not on st, because it was extracted when set in
+	 * service).
 	 */
-	if (unlikely(force && entity != entity->sched_data->next_in_service)) {
-		new_next_in_service = entity;
-		for_each_entity(new_next_in_service)
-			bfq_update_budget(new_next_in_service);
+	if (!in_service)
+		bfq_update_vtime(st, new_vtime);
+
+	entity = bfq_first_active_entity(st, new_vtime);
+	BUG_ON(bfq_gt(entity->start, new_vtime));
+
+	/* Log some information */
+	bfqq = bfq_entity_to_bfqq(entity);
+	if (bfqq)
+		bfq_log_bfqq(bfqq->bfqd, bfqq,
+			     "__lookup_next: start %llu vtime %llu st %p",
+			     ((entity->start>>10)*1000)>>12,
+			     ((new_vtime>>10)*1000)>>12, st);
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+	else {
+		struct bfq_group *bfqg =
+			container_of(entity, struct bfq_group, entity);
+
+		bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg,
+			     "__lookup_next: start %llu vtime %llu st %p",
+			     ((entity->start>>10)*1000)>>12,
+			     ((new_vtime>>10)*1000)>>12, st);
 	}
+#endif
+
+	BUG_ON(!entity);
 
 	return entity;
 }
@@ -1025,50 +1590,81 @@ static struct bfq_entity *__bfq_lookup_next_entity(struct bfq_service_tree *st,
 /**
  * bfq_lookup_next_entity - return the first eligible entity in @sd.
  * @sd: the sched_data.
- * @extract: if true the returned entity will be also extracted from @sd.
  *
- * NOTE: since we cache the next_in_service entity at each level of the
- * hierarchy, the complexity of the lookup can be decreased with
- * absolutely no effort just returning the cached next_in_service value;
- * we prefer to do full lookups to test the consistency of * the data
- * structures.
+ * This function is invoked when there has been a change in the trees
+ * for sd, and we need to know what the new next entity is after this
+ * change.
  */
-static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd,
-						 int extract,
-						 struct bfq_data *bfqd)
+static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd)
 {
 	struct bfq_service_tree *st = sd->service_tree;
-	struct bfq_entity *entity;
-	int i = 0;
-
-	BUG_ON(sd->in_service_entity);
+	struct bfq_service_tree *idle_class_st = st + (BFQ_IOPRIO_CLASSES - 1);
+	struct bfq_entity *entity = NULL;
+	struct bfq_queue *bfqq;
+	int class_idx = 0;
 
-	if (bfqd &&
-	    jiffies - bfqd->bfq_class_idle_last_service > BFQ_CL_IDLE_TIMEOUT) {
-		entity = __bfq_lookup_next_entity(st + BFQ_IOPRIO_CLASSES - 1,
-						  true);
-		if (entity) {
-			i = BFQ_IOPRIO_CLASSES - 1;
-			bfqd->bfq_class_idle_last_service = jiffies;
-			sd->next_in_service = entity;
-		}
+	BUG_ON(!sd);
+	BUG_ON(!st);
+	/*
+	 * Choose from idle class, if needed to guarantee a minimum
+	 * bandwidth to this class (and if there is some active entity
+	 * in idle class). This should also mitigate
+	 * priority-inversion problems in case a low priority task is
+	 * holding file system resources.
+	 */
+	if (time_is_before_jiffies(sd->bfq_class_idle_last_service +
+				   BFQ_CL_IDLE_TIMEOUT)) {
+		if (!RB_EMPTY_ROOT(&idle_class_st->active))
+			class_idx = BFQ_IOPRIO_CLASSES - 1;
+		/* About to be served if backlogged, or not yet backlogged */
+		sd->bfq_class_idle_last_service = jiffies;
 	}
-	for (; i < BFQ_IOPRIO_CLASSES; i++) {
-		entity = __bfq_lookup_next_entity(st + i, false);
-		if (entity) {
-			if (extract) {
-				bfq_check_next_in_service(sd, entity);
-				bfq_active_extract(st + i, entity);
-				sd->in_service_entity = entity;
-				sd->next_in_service = NULL;
-			}
+
+	/*
+	 * Find the next entity to serve for the highest-priority
+	 * class, unless the idle class needs to be served.
+	 */
+	for (; class_idx < BFQ_IOPRIO_CLASSES; class_idx++) {
+		entity = __bfq_lookup_next_entity(st + class_idx,
+						  sd->in_service_entity);
+
+		if (entity)
 			break;
-		}
 	}
 
+	BUG_ON(!entity &&
+	       (!RB_EMPTY_ROOT(&st->active) || !RB_EMPTY_ROOT(&(st+1)->active) ||
+		!RB_EMPTY_ROOT(&(st+2)->active)));
+
+	if (!entity)
+		return NULL;
+
+	/* Log some information */
+	bfqq = bfq_entity_to_bfqq(entity);
+	if (bfqq)
+		bfq_log_bfqq(bfqq->bfqd, bfqq, "chosen from st %p %d",
+			     st + class_idx, class_idx);
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+	else {
+		struct bfq_group *bfqg =
+			container_of(entity, struct bfq_group, entity);
+
+		bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg,
+			     "chosen from st %p %d",
+			     st + class_idx, class_idx);
+	}
+#endif
+
 	return entity;
 }
 
+static bool next_queue_may_preempt(struct bfq_data *bfqd)
+{
+	struct bfq_sched_data *sd = &bfqd->root_group->sched_data;
+
+	return sd->next_in_service != sd->in_service_entity;
+}
+
 /*
  * Get next queue for service.
  */
@@ -1083,58 +1679,208 @@ static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd)
 	if (bfqd->busy_queues == 0)
 		return NULL;
 
+	/*
+	 * Traverse the path from the root to the leaf entity to
+	 * serve. Set in service all the entities visited along the
+	 * way.
+	 */
 	sd = &bfqd->root_group->sched_data;
 	for (; sd ; sd = entity->my_sched_data) {
-		entity = bfq_lookup_next_entity(sd, 1, bfqd);
-		BUG_ON(!entity);
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+		if (entity) {
+			struct bfq_group *bfqg =
+				container_of(entity, struct bfq_group, entity);
+
+			bfq_log_bfqg(bfqd, bfqg,
+				     "get_next_queue: lookup in this group");
+			if (!sd->next_in_service)
+				pr_crit("get_next_queue: lookup in this group");
+		} else {
+			bfq_log_bfqg(bfqd, bfqd->root_group,
+				     "get_next_queue: lookup in root group");
+			if (!sd->next_in_service)
+				pr_crit("get_next_queue: lookup in root group");
+		}
+#endif
+
+		BUG_ON(!sd->next_in_service);
+
+		/*
+		 * WARNING. We are about to set the in-service entity
+		 * to sd->next_in_service, i.e., to the (cached) value
+		 * returned by bfq_lookup_next_entity(sd) the last
+		 * time it was invoked, i.e., the last time when the
+		 * service order in sd changed as a consequence of the
+		 * activation or deactivation of an entity. In this
+		 * respect, if we execute bfq_lookup_next_entity(sd)
+		 * in this very moment, it may, although with low
+		 * probability, yield a different entity than that
+		 * pointed to by sd->next_in_service. This rare event
+		 * happens in case there was no CLASS_IDLE entity to
+		 * serve for sd when bfq_lookup_next_entity(sd) was
+		 * invoked for the last time, while there is now one
+		 * such entity.
+		 *
+		 * If the above event happens, then the scheduling of
+		 * such entity in CLASS_IDLE is postponed until the
+		 * service of the sd->next_in_service entity
+		 * finishes. In fact, when the latter is expired,
+		 * bfq_lookup_next_entity(sd) gets called again,
+		 * exactly to update sd->next_in_service.
+		 */
+
+		/* Make next_in_service entity become in_service_entity */
+		entity = sd->next_in_service;
+		sd->in_service_entity = entity;
+
+		/*
+		 * Reset the accumulator of the amount of service that
+		 * the entity is about to receive.
+		 */
 		entity->service = 0;
+
+		/*
+		 * If entity is no longer a candidate for next
+		 * service, then we extract it from its active tree,
+		 * for the following reason. To further boost the
+		 * throughput in some special case, BFQ needs to know
+		 * which is the next candidate entity to serve, while
+		 * there is already an entity in service. In this
+		 * respect, to make it easy to compute/update the next
+		 * candidate entity to serve after the current
+		 * candidate has been set in service, there is a case
+		 * where it is necessary to extract the current
+		 * candidate from its service tree. Such a case is
+		 * when the entity just set in service cannot be also
+		 * a candidate for next service. Details about when
+		 * this condition holds are reported in the comments
+		 * on the function bfq_no_longer_next_in_service()
+		 * invoked below.
+		 */
+		if (bfq_no_longer_next_in_service(entity))
+			bfq_active_extract(bfq_entity_service_tree(entity),
+					   entity);
+
+		/*
+		 * For the same reason why we may have just extracted
+		 * entity from its active tree, we may need to update
+		 * next_in_service for the sched_data of entity too,
+		 * regardless of whether entity has been extracted.
+		 * In fact, even if entity has not been extracted, a
+		 * descendant entity may get extracted. Such an event
+		 * would cause a change in next_in_service for the
+		 * level of the descendant entity, and thus possibly
+		 * back to upper levels.
+		 *
+		 * We cannot perform the resulting needed update
+		 * before the end of this loop, because, to know which
+		 * is the correct next-to-serve candidate entity for
+		 * each level, we need first to find the leaf entity
+		 * to set in service. In fact, only after we know
+		 * which is the next-to-serve leaf entity, we can
+		 * discover whether the parent entity of the leaf
+		 * entity becomes the next-to-serve, and so on.
+		 */
+
+		/* Log some information */
+		bfqq = bfq_entity_to_bfqq(entity);
+		if (bfqq)
+			bfq_log_bfqq(bfqd, bfqq,
+			     "get_next_queue: this queue, finish %llu",
+				(((entity->finish>>10)*1000)>>10)>>2);
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+		else {
+			struct bfq_group *bfqg =
+				container_of(entity, struct bfq_group, entity);
+
+			bfq_log_bfqg(bfqd, bfqg,
+			     "get_next_queue: this entity, finish %llu",
+				(((entity->finish>>10)*1000)>>10)>>2);
+		}
+#endif
+
 	}
 
+	BUG_ON(!entity);
 	bfqq = bfq_entity_to_bfqq(entity);
 	BUG_ON(!bfqq);
 
+	/*
+	 * We can finally update all next-to-serve entities along the
+	 * path from the leaf entity just set in service to the root.
+	 */
+	for_each_entity(entity) {
+		struct bfq_sched_data *sd = entity->sched_data;
+
+		if(!bfq_update_next_in_service(sd, NULL))
+			break;
+	}
+
 	return bfqq;
 }
 
 static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd)
 {
+	struct bfq_entity *entity = &bfqd->in_service_queue->entity;
+
 	if (bfqd->in_service_bic) {
 		put_io_context(bfqd->in_service_bic->icq.ioc);
 		bfqd->in_service_bic = NULL;
 	}
 
+	bfq_clear_bfqq_wait_request(bfqd->in_service_queue);
+	hrtimer_try_to_cancel(&bfqd->idle_slice_timer);
 	bfqd->in_service_queue = NULL;
-	del_timer(&bfqd->idle_slice_timer);
+
+	/*
+	 * When this function is called, all in-service entities have
+	 * been properly deactivated or requeued, so we can safely
+	 * execute the final step: reset in_service_entity along the
+	 * path from entity to the root.
+	 */
+	for_each_entity(entity)
+		entity->sched_data->in_service_entity = NULL;
 }
 
 static void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
-				int requeue)
+				bool ins_into_idle_tree, bool expiration)
 {
 	struct bfq_entity *entity = &bfqq->entity;
 
-	if (bfqq == bfqd->in_service_queue)
-		__bfq_bfqd_reset_in_service(bfqd);
-
-	bfq_deactivate_entity(entity, requeue);
+	bfq_deactivate_entity(entity, ins_into_idle_tree, expiration);
 }
 
 static void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
 {
 	struct bfq_entity *entity = &bfqq->entity;
+	struct bfq_service_tree *st = bfq_entity_service_tree(entity);
+
+	BUG_ON(bfqq == bfqd->in_service_queue);
+	BUG_ON(entity->tree != &st->active && entity->tree != &st->idle &&
+	       entity->on_st);
 
-	bfq_activate_entity(entity);
+	bfq_activate_requeue_entity(entity, bfq_bfqq_non_blocking_wait_rq(bfqq),
+				    false);
+	bfq_clear_bfqq_non_blocking_wait_rq(bfqq);
+}
+
+static void bfq_requeue_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+{
+	struct bfq_entity *entity = &bfqq->entity;
+
+	bfq_activate_requeue_entity(entity, false,
+				    bfqq == bfqd->in_service_queue);
 }
 
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
 static void bfqg_stats_update_dequeue(struct bfq_group *bfqg);
-#endif
 
 /*
  * Called when the bfqq no longer has requests pending, remove it from
- * the service tree.
+ * the service tree. As a special case, it can be invoked during an
+ * expiration.
  */
 static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq,
-			      int requeue)
+			      bool expiration)
 {
 	BUG_ON(!bfq_bfqq_busy(bfqq));
 	BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list));
@@ -1146,27 +1892,20 @@ static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq,
 	BUG_ON(bfqd->busy_queues == 0);
 	bfqd->busy_queues--;
 
-	if (!bfqq->dispatched) {
+	if (!bfqq->dispatched)
 		bfq_weights_tree_remove(bfqd, &bfqq->entity,
 					&bfqd->queue_weights_tree);
-		if (!blk_queue_nonrot(bfqd->queue)) {
-			BUG_ON(!bfqd->busy_in_flight_queues);
-			bfqd->busy_in_flight_queues--;
-			if (bfq_bfqq_constantly_seeky(bfqq)) {
-				BUG_ON(!bfqd->
-					const_seeky_busy_in_flight_queues);
-				bfqd->const_seeky_busy_in_flight_queues--;
-			}
-		}
-	}
+
 	if (bfqq->wr_coeff > 1)
 		bfqd->wr_busy_queues--;
 
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
 	bfqg_stats_update_dequeue(bfqq_group(bfqq));
-#endif
 
-	bfq_deactivate_bfqq(bfqd, bfqq, requeue);
+	BUG_ON(bfqq->entity.budget < 0);
+
+	bfq_deactivate_bfqq(bfqd, bfqq, true, expiration);
+
+	BUG_ON(bfqq->entity.budget < 0);
 }
 
 /*
@@ -1184,16 +1923,11 @@ static void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq)
 	bfq_mark_bfqq_busy(bfqq);
 	bfqd->busy_queues++;
 
-	if (!bfqq->dispatched) {
+	if (!bfqq->dispatched)
 		if (bfqq->wr_coeff == 1)
 			bfq_weights_tree_add(bfqd, &bfqq->entity,
 					     &bfqd->queue_weights_tree);
-		if (!blk_queue_nonrot(bfqd->queue)) {
-			bfqd->busy_in_flight_queues++;
-			if (bfq_bfqq_constantly_seeky(bfqq))
-				bfqd->const_seeky_busy_in_flight_queues++;
-		}
-	}
+
 	if (bfqq->wr_coeff > 1)
 		bfqd->wr_busy_queues++;
 }
diff --git a/block/bfq.h b/block/bfq.h
index fcce855..bef8244 100644
--- a/block/bfq.h
+++ b/block/bfq.h
@@ -1,5 +1,5 @@
 /*
- * BFQ-v7r11 for 4.5.0: data structures and common functions prototypes.
+ * BFQ v8r7 for 4.9.0: data structures and common functions prototypes.
  *
  * Based on ideas and code from CFQ:
  * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
@@ -7,7 +7,9 @@
  * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
  *		      Paolo Valente <paolo.valente@unimore.it>
  *
- * Copyright (C) 2010 Paolo Valente <paolo.valente@unimore.it>
+ * Copyright (C) 2015 Paolo Valente <paolo.valente@unimore.it>
+ *
+ * Copyright (C) 2016 Paolo Valente <paolo.valente@linaro.org>
  */
 
 #ifndef _BFQ_H
@@ -28,20 +30,21 @@
 
 #define BFQ_DEFAULT_QUEUE_IOPRIO	4
 
-#define BFQ_DEFAULT_GRP_WEIGHT	10
+#define BFQ_WEIGHT_LEGACY_DFL	100
 #define BFQ_DEFAULT_GRP_IOPRIO	0
 #define BFQ_DEFAULT_GRP_CLASS	IOPRIO_CLASS_BE
 
+/*
+ * Soft real-time applications are extremely more latency sensitive
+ * than interactive ones. Over-raise the weight of the former to
+ * privilege them against the latter.
+ */
+#define BFQ_SOFTRT_WEIGHT_FACTOR	100
+
 struct bfq_entity;
 
 /**
  * struct bfq_service_tree - per ioprio_class service tree.
- * @active: tree for active entities (i.e., those backlogged).
- * @idle: tree for idle entities (i.e., those not backlogged, with V <= F_i).
- * @first_idle: idle entity with minimum F_i.
- * @last_idle: idle entity with maximum F_i.
- * @vtime: scheduler virtual time.
- * @wsum: scheduler weight sum; active and idle entities contribute to it.
  *
  * Each service tree represents a B-WF2Q+ scheduler on its own.  Each
  * ioprio_class has its own independent scheduler, and so its own
@@ -49,27 +52,28 @@ struct bfq_entity;
  * of the containing bfqd.
  */
 struct bfq_service_tree {
+	/* tree for active entities (i.e., those backlogged) */
 	struct rb_root active;
+	/* tree for idle entities (i.e., not backlogged, with V <= F_i) */
 	struct rb_root idle;
 
-	struct bfq_entity *first_idle;
-	struct bfq_entity *last_idle;
+	struct bfq_entity *first_idle;	/* idle entity with minimum F_i */
+	struct bfq_entity *last_idle;	/* idle entity with maximum F_i */
 
-	u64 vtime;
+	u64 vtime; /* scheduler virtual time */
+	/* scheduler weight sum; active and idle entities contribute to it */
 	unsigned long wsum;
 };
 
 /**
  * struct bfq_sched_data - multi-class scheduler.
- * @in_service_entity: entity in service.
- * @next_in_service: head-of-the-line entity in the scheduler.
- * @service_tree: array of service trees, one per ioprio_class.
  *
  * bfq_sched_data is the basic scheduler queue.  It supports three
- * ioprio_classes, and can be used either as a toplevel queue or as
- * an intermediate queue on a hierarchical setup.
- * @next_in_service points to the active entity of the sched_data
- * service trees that will be scheduled next.
+ * ioprio_classes, and can be used either as a toplevel queue or as an
+ * intermediate queue on a hierarchical setup.  @next_in_service
+ * points to the active entity of the sched_data service trees that
+ * will be scheduled next. It is used to reduce the number of steps
+ * needed for each hierarchical-schedule update.
  *
  * The supported ioprio_classes are the same as in CFQ, in descending
  * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE.
@@ -79,48 +83,32 @@ struct bfq_service_tree {
  * All the fields are protected by the queue lock of the containing bfqd.
  */
 struct bfq_sched_data {
-	struct bfq_entity *in_service_entity;
+	struct bfq_entity *in_service_entity;  /* entity in service */
+	/* head-of-the-line entity in the scheduler (see comments above) */
 	struct bfq_entity *next_in_service;
+	/* array of service trees, one per ioprio_class */
 	struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES];
+	/* last time CLASS_IDLE was served */
+	unsigned long bfq_class_idle_last_service;
+
 };
 
 /**
  * struct bfq_weight_counter - counter of the number of all active entities
  *                             with a given weight.
- * @weight: weight of the entities that this counter refers to.
- * @num_active: number of active entities with this weight.
- * @weights_node: weights tree member (see bfq_data's @queue_weights_tree
- *                and @group_weights_tree).
  */
 struct bfq_weight_counter {
-	short int weight;
-	unsigned int num_active;
+	unsigned int weight; /* weight of the entities this counter refers to */
+	unsigned int num_active; /* nr of active entities with this weight */
+	/*
+	 * Weights tree member (see bfq_data's @queue_weights_tree and
+	 * @group_weights_tree)
+	 */
 	struct rb_node weights_node;
 };
 
 /**
  * struct bfq_entity - schedulable entity.
- * @rb_node: service_tree member.
- * @weight_counter: pointer to the weight counter associated with this entity.
- * @on_st: flag, true if the entity is on a tree (either the active or
- *         the idle one of its service_tree).
- * @finish: B-WF2Q+ finish timestamp (aka F_i).
- * @start: B-WF2Q+ start timestamp (aka S_i).
- * @tree: tree the entity is enqueued into; %NULL if not on a tree.
- * @min_start: minimum start time of the (active) subtree rooted at
- *             this entity; used for O(log N) lookups into active trees.
- * @service: service received during the last round of service.
- * @budget: budget used to calculate F_i; F_i = S_i + @budget / @weight.
- * @weight: weight of the queue
- * @parent: parent entity, for hierarchical scheduling.
- * @my_sched_data: for non-leaf nodes in the cgroup hierarchy, the
- *                 associated scheduler queue, %NULL on leaf nodes.
- * @sched_data: the scheduler queue this entity belongs to.
- * @ioprio: the ioprio in use.
- * @new_weight: when a weight change is requested, the new weight value.
- * @orig_weight: original weight, used to implement weight boosting
- * @prio_changed: flag, true when the user requested a weight, ioprio or
- *		  ioprio_class change.
  *
  * A bfq_entity is used to represent either a bfq_queue (leaf node in the
  * cgroup hierarchy) or a bfq_group into the upper level scheduler.  Each
@@ -147,27 +135,52 @@ struct bfq_weight_counter {
  * containing bfqd.
  */
 struct bfq_entity {
-	struct rb_node rb_node;
+	struct rb_node rb_node; /* service_tree member */
+	/* pointer to the weight counter associated with this entity */
 	struct bfq_weight_counter *weight_counter;
 
-	int on_st;
+	/*
+	 * Flag, true if the entity is on a tree (either the active or
+	 * the idle one of its service_tree) or is in service.
+	 */
+	bool on_st;
 
-	u64 finish;
-	u64 start;
+	u64 finish; /* B-WF2Q+ finish timestamp (aka F_i) */
+	u64 start;  /* B-WF2Q+ start timestamp (aka S_i) */
 
+	/* tree the entity is enqueued into; %NULL if not on a tree */
 	struct rb_root *tree;
 
+	/*
+	 * minimum start time of the (active) subtree rooted at this
+	 * entity; used for O(log N) lookups into active trees
+	 */
 	u64 min_start;
 
-	int service, budget;
-	unsigned short weight, new_weight;
-	unsigned short orig_weight;
+	/* amount of service received during the last service slot */
+	int service;
+
+	/* budget, used also to calculate F_i: F_i = S_i + @budget / @weight */
+	int budget;
+
+	unsigned int weight;	 /* weight of the queue */
+	unsigned int new_weight; /* next weight if a change is in progress */
+
+	/* original weight, used to implement weight boosting */
+	unsigned int orig_weight;
 
+	/* parent entity, for hierarchical scheduling */
 	struct bfq_entity *parent;
 
+	/*
+	 * For non-leaf nodes in the hierarchy, the associated
+	 * scheduler queue, %NULL on leaf nodes.
+	 */
 	struct bfq_sched_data *my_sched_data;
+	/* the scheduler queue this entity belongs to */
 	struct bfq_sched_data *sched_data;
 
+	/* flag, set to request a weight, ioprio or ioprio_class change  */
 	int prio_changed;
 };
 
@@ -175,56 +188,6 @@ struct bfq_group;
 
 /**
  * struct bfq_queue - leaf schedulable entity.
- * @ref: reference counter.
- * @bfqd: parent bfq_data.
- * @new_ioprio: when an ioprio change is requested, the new ioprio value.
- * @ioprio_class: the ioprio_class in use.
- * @new_ioprio_class: when an ioprio_class change is requested, the new
- *                    ioprio_class value.
- * @new_bfqq: shared bfq_queue if queue is cooperating with
- *           one or more other queues.
- * @pos_node: request-position tree member (see bfq_group's @rq_pos_tree).
- * @pos_root: request-position tree root (see bfq_group's @rq_pos_tree).
- * @sort_list: sorted list of pending requests.
- * @next_rq: if fifo isn't expired, next request to serve.
- * @queued: nr of requests queued in @sort_list.
- * @allocated: currently allocated requests.
- * @meta_pending: pending metadata requests.
- * @fifo: fifo list of requests in sort_list.
- * @entity: entity representing this queue in the scheduler.
- * @max_budget: maximum budget allowed from the feedback mechanism.
- * @budget_timeout: budget expiration (in jiffies).
- * @dispatched: number of requests on the dispatch list or inside driver.
- * @flags: status flags.
- * @bfqq_list: node for active/idle bfqq list inside our bfqd.
- * @burst_list_node: node for the device's burst list.
- * @seek_samples: number of seeks sampled
- * @seek_total: sum of the distances of the seeks sampled
- * @seek_mean: mean seek distance
- * @last_request_pos: position of the last request enqueued
- * @requests_within_timer: number of consecutive pairs of request completion
- *                         and arrival, such that the queue becomes idle
- *                         after the completion, but the next request arrives
- *                         within an idle time slice; used only if the queue's
- *                         IO_bound has been cleared.
- * @pid: pid of the process owning the queue, used for logging purposes.
- * @last_wr_start_finish: start time of the current weight-raising period if
- *                        the @bfq-queue is being weight-raised, otherwise
- *                        finish time of the last weight-raising period
- * @wr_cur_max_time: current max raising time for this queue
- * @soft_rt_next_start: minimum time instant such that, only if a new
- *                      request is enqueued after this time instant in an
- *                      idle @bfq_queue with no outstanding requests, then
- *                      the task associated with the queue it is deemed as
- *                      soft real-time (see the comments to the function
- *                      bfq_bfqq_softrt_next_start())
- * @last_idle_bklogged: time of the last transition of the @bfq_queue from
- *                      idle to backlogged
- * @service_from_backlogged: cumulative service received from the @bfq_queue
- *                           since the last transition from idle to
- *                           backlogged
- * @bic: pointer to the bfq_io_cq owning the bfq_queue, set to %NULL if the
- *	 queue is shared
  *
  * A bfq_queue is a leaf request queue; it can be associated with an
  * io_context or more, if it  is  async or shared  between  cooperating
@@ -235,117 +198,175 @@ struct bfq_group;
  * All the fields are protected by the queue lock of the containing bfqd.
  */
 struct bfq_queue {
-	atomic_t ref;
+	/* reference counter */
+	int ref;
+	/* parent bfq_data */
 	struct bfq_data *bfqd;
 
-	unsigned short ioprio, new_ioprio;
-	unsigned short ioprio_class, new_ioprio_class;
+	/* current ioprio and ioprio class */
+	unsigned short ioprio, ioprio_class;
+	/* next ioprio and ioprio class if a change is in progress */
+	unsigned short new_ioprio, new_ioprio_class;
 
-	/* fields for cooperating queues handling */
+	/*
+	 * Shared bfq_queue if queue is cooperating with one or more
+	 * other queues.
+	 */
 	struct bfq_queue *new_bfqq;
+	/* request-position tree member (see bfq_group's @rq_pos_tree) */
 	struct rb_node pos_node;
+	/* request-position tree root (see bfq_group's @rq_pos_tree) */
 	struct rb_root *pos_root;
 
+	/* sorted list of pending requests */
 	struct rb_root sort_list;
+	/* if fifo isn't expired, next request to serve */
 	struct request *next_rq;
+	/* number of sync and async requests queued */
 	int queued[2];
+	/* number of sync and async requests currently allocated */
 	int allocated[2];
+	/* number of pending metadata requests */
 	int meta_pending;
+	/* fifo list of requests in sort_list */
 	struct list_head fifo;
 
+	/* entity representing this queue in the scheduler */
 	struct bfq_entity entity;
 
+	/* maximum budget allowed from the feedback mechanism */
 	int max_budget;
+	/* budget expiration (in jiffies) */
 	unsigned long budget_timeout;
 
+	/* number of requests on the dispatch list or inside driver */
 	int dispatched;
 
-	unsigned int flags;
+	unsigned int flags; /* status flags.*/
 
+	/* node for active/idle bfqq list inside parent bfqd */
 	struct list_head bfqq_list;
 
+	/* bit vector: a 1 for each seeky request in history */
+	u32 seek_history;
+
+	/* node for the device's burst list */
 	struct hlist_node burst_list_node;
 
-	unsigned int seek_samples;
-	u64 seek_total;
-	sector_t seek_mean;
+	/* position of the last request enqueued */
 	sector_t last_request_pos;
 
+	/* Number of consecutive pairs of request completion and
+	 * arrival, such that the queue becomes idle after the
+	 * completion, but the next request arrives within an idle
+	 * time slice; used only if the queue's IO_bound flag has been
+	 * cleared.
+	 */
 	unsigned int requests_within_timer;
 
+	/* pid of the process owning the queue, used for logging purposes */
 	pid_t pid;
+
+	/*
+	 * Pointer to the bfq_io_cq owning the bfq_queue, set to %NULL
+	 * if the queue is shared.
+	 */
 	struct bfq_io_cq *bic;
 
-	/* weight-raising fields */
+	/* current maximum weight-raising time for this queue */
 	unsigned long wr_cur_max_time;
+	/*
+	 * Minimum time instant such that, only if a new request is
+	 * enqueued after this time instant in an idle @bfq_queue with
+	 * no outstanding requests, then the task associated with the
+	 * queue is deemed as soft real-time (see the comments on
+	 * the function bfq_bfqq_softrt_next_start())
+	 */
 	unsigned long soft_rt_next_start;
+	/*
+	 * Start time of the current weight-raising period if
+	 * the @bfq_queue is being weight-raised, otherwise
+	 * finish time of the last weight-raising period.
+	 */
 	unsigned long last_wr_start_finish;
+	/* factor by which the weight of this queue is multiplied */
 	unsigned int wr_coeff;
+	/*
+	 * Time of the last transition of the @bfq_queue from idle to
+	 * backlogged.
+	 */
 	unsigned long last_idle_bklogged;
+	/*
+	 * Cumulative service received from the @bfq_queue since the
+	 * last transition from idle to backlogged.
+	 */
 	unsigned long service_from_backlogged;
+	/*
+	 * Value of wr start time when switching to soft rt
+	 */
+	unsigned long wr_start_at_switch_to_srt;
+
+	unsigned long split_time; /* time of last split */
 };
 
 /**
  * struct bfq_ttime - per process thinktime stats.
- * @ttime_total: total process thinktime
- * @ttime_samples: number of thinktime samples
- * @ttime_mean: average process thinktime
  */
 struct bfq_ttime {
-	unsigned long last_end_request;
+	u64 last_end_request; /* completion time of last request */
+
+	u64 ttime_total; /* total process thinktime */
+	unsigned long ttime_samples; /* number of thinktime samples */
+	u64 ttime_mean; /* average process thinktime */
 
-	unsigned long ttime_total;
-	unsigned long ttime_samples;
-	unsigned long ttime_mean;
 };
 
 /**
  * struct bfq_io_cq - per (request_queue, io_context) structure.
- * @icq: associated io_cq structure
- * @bfqq: array of two process queues, the sync and the async
- * @ttime: associated @bfq_ttime struct
- * @ioprio: per (request_queue, blkcg) ioprio.
- * @blkcg_id: id of the blkcg the related io_cq belongs to.
- * @wr_time_left: snapshot of the time left before weight raising ends
- *                for the sync queue associated to this process; this
- *		  snapshot is taken to remember this value while the weight
- *		  raising is suspended because the queue is merged with a
- *		  shared queue, and is used to set @raising_cur_max_time
- *		  when the queue is split from the shared queue and its
- *		  weight is raised again
- * @saved_idle_window: same purpose as the previous field for the idle
- *                     window
- * @saved_IO_bound: same purpose as the previous two fields for the I/O
- *                  bound classification of a queue
- * @saved_in_large_burst: same purpose as the previous fields for the
- *                        value of the field keeping the queue's belonging
- *                        to a large burst
- * @was_in_burst_list: true if the queue belonged to a burst list
- *                     before its merge with another cooperating queue
- * @cooperations: counter of consecutive successful queue merges underwent
- *                by any of the process' @bfq_queues
- * @failed_cooperations: counter of consecutive failed queue merges of any
- *                       of the process' @bfq_queues
  */
 struct bfq_io_cq {
+	/* associated io_cq structure */
 	struct io_cq icq; /* must be the first member */
+	/* array of two process queues, the sync and the async */
 	struct bfq_queue *bfqq[2];
+	/* associated @bfq_ttime struct */
 	struct bfq_ttime ttime;
+	/* per (request_queue, blkcg) ioprio */
 	int ioprio;
-
 #ifdef CONFIG_BFQ_GROUP_IOSCHED
-	uint64_t blkcg_id; /* the current blkcg ID */
+	uint64_t blkcg_serial_nr; /* the current blkcg serial */
 #endif
 
-	unsigned int wr_time_left;
+	/*
+	 * Snapshot of the idle window before merging; taken to
+	 * remember this value while the queue is merged, so as to be
+	 * able to restore it in case of split.
+	 */
 	bool saved_idle_window;
+	/*
+	 * Same purpose as the previous field for the I/O bound
+	 * classification of a queue.
+	 */
 	bool saved_IO_bound;
 
+	/*
+	 * Same purpose as the previous fields for the value of the
+	 * field keeping the queue's belonging to a large burst
+	 */
 	bool saved_in_large_burst;
+	/*
+	 * True if the queue belonged to a burst list before its merge
+	 * with another cooperating queue.
+	 */
 	bool was_in_burst_list;
 
-	unsigned int cooperations;
-	unsigned int failed_cooperations;
+	/*
+	 * Similar to previous fields: save wr information.
+	 */
+	unsigned long saved_wr_coeff;
+	unsigned long saved_last_wr_start_finish;
+	unsigned long saved_wr_start_at_switch_to_srt;
+	unsigned int saved_wr_cur_max_time;
 };
 
 enum bfq_device_speed {
@@ -354,224 +375,232 @@ enum bfq_device_speed {
 };
 
 /**
- * struct bfq_data - per device data structure.
- * @queue: request queue for the managed device.
- * @root_group: root bfq_group for the device.
- * @active_numerous_groups: number of bfq_groups containing more than one
- *                          active @bfq_entity.
- * @queue_weights_tree: rbtree of weight counters of @bfq_queues, sorted by
- *                      weight. Used to keep track of whether all @bfq_queues
- *                     have the same weight. The tree contains one counter
- *                     for each distinct weight associated to some active
- *                     and not weight-raised @bfq_queue (see the comments to
- *                      the functions bfq_weights_tree_[add|remove] for
- *                     further details).
- * @group_weights_tree: rbtree of non-queue @bfq_entity weight counters, sorted
- *                      by weight. Used to keep track of whether all
- *                     @bfq_groups have the same weight. The tree contains
- *                     one counter for each distinct weight associated to
- *                     some active @bfq_group (see the comments to the
- *                     functions bfq_weights_tree_[add|remove] for further
- *                     details).
- * @busy_queues: number of bfq_queues containing requests (including the
- *		 queue in service, even if it is idling).
- * @busy_in_flight_queues: number of @bfq_queues containing pending or
- *                         in-flight requests, plus the @bfq_queue in
- *                         service, even if idle but waiting for the
- *                         possible arrival of its next sync request. This
- *                         field is updated only if the device is rotational,
- *                         but used only if the device is also NCQ-capable.
- *                         The reason why the field is updated also for non-
- *                         NCQ-capable rotational devices is related to the
- *                         fact that the value of @hw_tag may be set also
- *                         later than when busy_in_flight_queues may need to
- *                         be incremented for the first time(s). Taking also
- *                         this possibility into account, to avoid unbalanced
- *                         increments/decrements, would imply more overhead
- *                         than just updating busy_in_flight_queues
- *                         regardless of the value of @hw_tag.
- * @const_seeky_busy_in_flight_queues: number of constantly-seeky @bfq_queues
- *                                     (that is, seeky queues that expired
- *                                     for budget timeout at least once)
- *                                     containing pending or in-flight
- *                                     requests, including the in-service
- *                                     @bfq_queue if constantly seeky. This
- *                                     field is updated only if the device
- *                                     is rotational, but used only if the
- *                                     device is also NCQ-capable (see the
- *                                     comments to @busy_in_flight_queues).
- * @wr_busy_queues: number of weight-raised busy @bfq_queues.
- * @queued: number of queued requests.
- * @rq_in_driver: number of requests dispatched and waiting for completion.
- * @sync_flight: number of sync requests in the driver.
- * @max_rq_in_driver: max number of reqs in driver in the last
- *                    @hw_tag_samples completed requests.
- * @hw_tag_samples: nr of samples used to calculate hw_tag.
- * @hw_tag: flag set to one if the driver is showing a queueing behavior.
- * @budgets_assigned: number of budgets assigned.
- * @idle_slice_timer: timer set when idling for the next sequential request
- *                    from the queue in service.
- * @unplug_work: delayed work to restart dispatching on the request queue.
- * @in_service_queue: bfq_queue in service.
- * @in_service_bic: bfq_io_cq (bic) associated with the @in_service_queue.
- * @last_position: on-disk position of the last served request.
- * @last_budget_start: beginning of the last budget.
- * @last_idling_start: beginning of the last idle slice.
- * @peak_rate: peak transfer rate observed for a budget.
- * @peak_rate_samples: number of samples used to calculate @peak_rate.
- * @bfq_max_budget: maximum budget allotted to a bfq_queue before
- *                  rescheduling.
- * @active_list: list of all the bfq_queues active on the device.
- * @idle_list: list of all the bfq_queues idle on the device.
- * @bfq_fifo_expire: timeout for async/sync requests; when it expires
- *                   requests are served in fifo order.
- * @bfq_back_penalty: weight of backward seeks wrt forward ones.
- * @bfq_back_max: maximum allowed backward seek.
- * @bfq_slice_idle: maximum idling time.
- * @bfq_user_max_budget: user-configured max budget value
- *                       (0 for auto-tuning).
- * @bfq_max_budget_async_rq: maximum budget (in nr of requests) allotted to
- *                           async queues.
- * @bfq_timeout: timeout for bfq_queues to consume their budget; used to
- *               to prevent seeky queues to impose long latencies to well
- *               behaved ones (this also implies that seeky queues cannot
- *               receive guarantees in the service domain; after a timeout
- *               they are charged for the whole allocated budget, to try
- *               to preserve a behavior reasonably fair among them, but
- *               without service-domain guarantees).
- * @bfq_coop_thresh: number of queue merges after which a @bfq_queue is
- *                   no more granted any weight-raising.
- * @bfq_failed_cooperations: number of consecutive failed cooperation
- *                           chances after which weight-raising is restored
- *                           to a queue subject to more than bfq_coop_thresh
- *                           queue merges.
- * @bfq_requests_within_timer: number of consecutive requests that must be
- *                             issued within the idle time slice to set
- *                             again idling to a queue which was marked as
- *                             non-I/O-bound (see the definition of the
- *                             IO_bound flag for further details).
- * @last_ins_in_burst: last time at which a queue entered the current
- *                     burst of queues being activated shortly after
- *                     each other; for more details about this and the
- *                     following parameters related to a burst of
- *                     activations, see the comments to the function
- *                     @bfq_handle_burst.
- * @bfq_burst_interval: reference time interval used to decide whether a
- *                      queue has been activated shortly after
- *                      @last_ins_in_burst.
- * @burst_size: number of queues in the current burst of queue activations.
- * @bfq_large_burst_thresh: maximum burst size above which the current
- *			    queue-activation burst is deemed as 'large'.
- * @large_burst: true if a large queue-activation burst is in progress.
- * @burst_list: head of the burst list (as for the above fields, more details
- *		in the comments to the function bfq_handle_burst).
- * @low_latency: if set to true, low-latency heuristics are enabled.
- * @bfq_wr_coeff: maximum factor by which the weight of a weight-raised
- *                queue is multiplied.
- * @bfq_wr_max_time: maximum duration of a weight-raising period (jiffies).
- * @bfq_wr_rt_max_time: maximum duration for soft real-time processes.
- * @bfq_wr_min_idle_time: minimum idle period after which weight-raising
- *			  may be reactivated for a queue (in jiffies).
- * @bfq_wr_min_inter_arr_async: minimum period between request arrivals
- *				after which weight-raising may be
- *				reactivated for an already busy queue
- *				(in jiffies).
- * @bfq_wr_max_softrt_rate: max service-rate for a soft real-time queue,
- *			    sectors per seconds.
- * @RT_prod: cached value of the product R*T used for computing the maximum
- *	     duration of the weight raising automatically.
- * @device_speed: device-speed class for the low-latency heuristic.
- * @oom_bfqq: fallback dummy bfqq for extreme OOM conditions.
+ * struct bfq_data - per-device data structure.
  *
  * All the fields are protected by the @queue lock.
  */
 struct bfq_data {
+	/* request queue for the device */
 	struct request_queue *queue;
 
+	/* root bfq_group for the device */
 	struct bfq_group *root_group;
 
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
-	int active_numerous_groups;
-#endif
-
+	/*
+	 * rbtree of weight counters of @bfq_queues, sorted by
+	 * weight. Used to keep track of whether all @bfq_queues have
+	 * the same weight. The tree contains one counter for each
+	 * distinct weight associated to some active and not
+	 * weight-raised @bfq_queue (see the comments to the functions
+	 * bfq_weights_tree_[add|remove] for further details).
+	 */
 	struct rb_root queue_weights_tree;
+	/*
+	 * rbtree of non-queue @bfq_entity weight counters, sorted by
+	 * weight. Used to keep track of whether all @bfq_groups have
+	 * the same weight. The tree contains one counter for each
+	 * distinct weight associated to some active @bfq_group (see
+	 * the comments to the functions bfq_weights_tree_[add|remove]
+	 * for further details).
+	 */
 	struct rb_root group_weights_tree;
 
+	/*
+	 * Number of bfq_queues containing requests (including the
+	 * queue in service, even if it is idling).
+	 */
 	int busy_queues;
-	int busy_in_flight_queues;
-	int const_seeky_busy_in_flight_queues;
+	/* number of weight-raised busy @bfq_queues */
 	int wr_busy_queues;
+	/* number of queued requests */
 	int queued;
+	/* number of requests dispatched and waiting for completion */
 	int rq_in_driver;
-	int sync_flight;
 
+	/*
+	 * Maximum number of requests in driver in the last
+	 * @hw_tag_samples completed requests.
+	 */
 	int max_rq_in_driver;
+	/* number of samples used to calculate hw_tag */
 	int hw_tag_samples;
+	/* flag set to one if the driver is showing a queueing behavior */
 	int hw_tag;
 
+	/* number of budgets assigned */
 	int budgets_assigned;
 
-	struct timer_list idle_slice_timer;
+	/*
+	 * Timer set when idling (waiting) for the next request from
+	 * the queue in service.
+	 */
+	struct hrtimer idle_slice_timer;
+	/* deferred work to restart dispatching on the request queue */
 	struct work_struct unplug_work;
 
+	/* bfq_queue in service */
 	struct bfq_queue *in_service_queue;
+	/* bfq_io_cq (bic) associated with the @in_service_queue */
 	struct bfq_io_cq *in_service_bic;
 
+	/* on-disk position of the last served request */
 	sector_t last_position;
 
+	/* time of last request completion (ns) */
+	u64 last_completion;
+
+	/* time of first rq dispatch in current observation interval (ns) */
+	u64 first_dispatch;
+	/* time of last rq dispatch in current observation interval (ns) */
+	u64 last_dispatch;
+
+	/* beginning of the last budget */
 	ktime_t last_budget_start;
+	/* beginning of the last idle slice */
 	ktime_t last_idling_start;
+
+	/* number of samples in current observation interval */
 	int peak_rate_samples;
-	u64 peak_rate;
+	/* num of samples of seq dispatches in current observation interval */
+	u32 sequential_samples;
+	/* total num of sectors transferred in current observation interval */
+	u64 tot_sectors_dispatched;
+	/* max rq size seen during current observation interval (sectors) */
+	u32 last_rq_max_size;
+	/* time elapsed from first dispatch in current observ. interval (us) */
+	u64 delta_from_first;
+	/* current estimate of device peak rate */
+	u32 peak_rate;
+
+	/* maximum budget allotted to a bfq_queue before rescheduling */
 	int bfq_max_budget;
 
+	/* list of all the bfq_queues active on the device */
 	struct list_head active_list;
+	/* list of all the bfq_queues idle on the device */
 	struct list_head idle_list;
 
-	unsigned int bfq_fifo_expire[2];
+	/*
+	 * Timeout for async/sync requests; when it fires, requests
+	 * are served in fifo order.
+	 */
+	u64 bfq_fifo_expire[2];
+	/* weight of backward seeks wrt forward ones */
 	unsigned int bfq_back_penalty;
+	/* maximum allowed backward seek */
 	unsigned int bfq_back_max;
-	unsigned int bfq_slice_idle;
-	u64 bfq_class_idle_last_service;
+	/* maximum idling time */
+	u32 bfq_slice_idle;
 
+	/* user-configured max budget value (0 for auto-tuning) */
 	int bfq_user_max_budget;
-	int bfq_max_budget_async_rq;
-	unsigned int bfq_timeout[2];
-
-	unsigned int bfq_coop_thresh;
-	unsigned int bfq_failed_cooperations;
+	/*
+	 * Timeout for bfq_queues to consume their budget; used to
+	 * prevent seeky queues from imposing long latencies on
+	 * sequential or quasi-sequential ones (this also implies that
+	 * seeky queues cannot receive guarantees in the service
+	 * domain; after a timeout they are charged for the time they
+	 * have been in service, to preserve fairness among them, but
+	 * without service-domain guarantees).
+	 */
+	unsigned int bfq_timeout;
+
+	/*
+	 * Number of consecutive requests that must be issued within
+	 * the idle time slice to re-enable idling for a queue which
+	 * was marked as non-I/O-bound (see the definition of the
+	 * IO_bound flag for further details).
+	 */
 	unsigned int bfq_requests_within_timer;
 
+	/*
+	 * Force device idling whenever needed to provide accurate
+	 * service guarantees, without caring about throughput
+	 * issues. CAVEAT: this may even increase latencies, in case
+	 * of useless idling for processes that have stopped doing I/O.
+	 */
+	bool strict_guarantees;
+
+	/*
+	 * Last time at which a queue entered the current burst of
+	 * queues being activated shortly after each other; for more
+	 * details about this and the following parameters related to
+	 * a burst of activations, see the comments on the function
+	 * bfq_handle_burst.
+	 */
 	unsigned long last_ins_in_burst;
+	/*
+	 * Reference time interval used to decide whether a queue has
+	 * been activated shortly after @last_ins_in_burst.
+	 */
 	unsigned long bfq_burst_interval;
+	/* number of queues in the current burst of queue activations */
 	int burst_size;
+
+	/* common parent entity for the queues in the burst */
+	struct bfq_entity *burst_parent_entity;
+	/* Maximum burst size above which the current queue-activation
+	 * burst is deemed as 'large'.
+	 */
 	unsigned long bfq_large_burst_thresh;
+	/* true if a large queue-activation burst is in progress */
 	bool large_burst;
+	/*
+	 * Head of the burst list (as for the above fields, more
+	 * details in the comments on the function bfq_handle_burst).
+	 */
 	struct hlist_head burst_list;
 
+	/* if set to true, low-latency heuristics are enabled */
 	bool low_latency;
-
-	/* parameters of the low_latency heuristics */
+	/*
+	 * Maximum factor by which the weight of a weight-raised queue
+	 * is multiplied.
+	 */
 	unsigned int bfq_wr_coeff;
+	/* maximum duration of a weight-raising period (jiffies) */
 	unsigned int bfq_wr_max_time;
+
+	/* Maximum weight-raising duration for soft real-time processes */
 	unsigned int bfq_wr_rt_max_time;
+	/*
+	 * Minimum idle period after which weight-raising may be
+	 * reactivated for a queue (in jiffies).
+	 */
 	unsigned int bfq_wr_min_idle_time;
+	/*
+	 * Minimum period between request arrivals after which
+	 * weight-raising may be reactivated for an already busy async
+	 * queue (in jiffies).
+	 */
 	unsigned long bfq_wr_min_inter_arr_async;
+
+	/* Max service-rate for a soft real-time queue, in sectors/sec */
 	unsigned int bfq_wr_max_softrt_rate;
+	/*
+	 * Cached value of the product R*T, used for computing the
+	 * maximum duration of weight raising automatically.
+	 */
 	u64 RT_prod;
+	/* device-speed class for the low-latency heuristic */
 	enum bfq_device_speed device_speed;
 
+	/* fallback dummy bfqq for extreme OOM conditions */
 	struct bfq_queue oom_bfqq;
 };
 
 enum bfqq_state_flags {
-	BFQ_BFQQ_FLAG_busy = 0,		/* has requests or is in service */
+	BFQ_BFQQ_FLAG_just_created = 0,	/* queue just allocated */
+	BFQ_BFQQ_FLAG_busy,		/* has requests or is in service */
 	BFQ_BFQQ_FLAG_wait_request,	/* waiting for a request */
+	BFQ_BFQQ_FLAG_non_blocking_wait_rq, /*
+					     * waiting for a request
+					     * without idling the device
+					     */
 	BFQ_BFQQ_FLAG_must_alloc,	/* must be allowed rq alloc */
 	BFQ_BFQQ_FLAG_fifo_expire,	/* FIFO checked in this slice */
 	BFQ_BFQQ_FLAG_idle_window,	/* slice idling enabled */
 	BFQ_BFQQ_FLAG_sync,		/* synchronous queue */
-	BFQ_BFQQ_FLAG_budget_new,	/* no completion with this budget */
 	BFQ_BFQQ_FLAG_IO_bound,		/*
 					 * bfqq has timed-out at least once
 					 * having consumed at most 2/10 of
@@ -581,17 +610,12 @@ enum bfqq_state_flags {
 					 * bfqq activated in a large burst,
 					 * see comments to bfq_handle_burst.
 					 */
-	BFQ_BFQQ_FLAG_constantly_seeky,	/*
-					 * bfqq has proved to be slow and
-					 * seeky until budget timeout
-					 */
 	BFQ_BFQQ_FLAG_softrt_update,	/*
 					 * may need softrt-next-start
 					 * update
 					 */
 	BFQ_BFQQ_FLAG_coop,		/* bfqq is shared */
-	BFQ_BFQQ_FLAG_split_coop,	/* shared bfqq will be split */
-	BFQ_BFQQ_FLAG_just_split,	/* queue has just been split */
+	BFQ_BFQQ_FLAG_split_coop	/* shared bfqq will be split */
 };
 
 #define BFQ_BFQQ_FNS(name)						\
@@ -608,28 +632,94 @@ static int bfq_bfqq_##name(const struct bfq_queue *bfqq)		\
 	return ((bfqq)->flags & (1 << BFQ_BFQQ_FLAG_##name)) != 0;	\
 }
 
+BFQ_BFQQ_FNS(just_created);
 BFQ_BFQQ_FNS(busy);
 BFQ_BFQQ_FNS(wait_request);
+BFQ_BFQQ_FNS(non_blocking_wait_rq);
 BFQ_BFQQ_FNS(must_alloc);
 BFQ_BFQQ_FNS(fifo_expire);
 BFQ_BFQQ_FNS(idle_window);
 BFQ_BFQQ_FNS(sync);
-BFQ_BFQQ_FNS(budget_new);
 BFQ_BFQQ_FNS(IO_bound);
 BFQ_BFQQ_FNS(in_large_burst);
-BFQ_BFQQ_FNS(constantly_seeky);
 BFQ_BFQQ_FNS(coop);
 BFQ_BFQQ_FNS(split_coop);
-BFQ_BFQQ_FNS(just_split);
 BFQ_BFQQ_FNS(softrt_update);
 #undef BFQ_BFQQ_FNS
 
 /* Logging facilities. */
-#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \
-	blk_add_trace_msg((bfqd)->queue, "bfq%d " fmt, (bfqq)->pid, ##args)
+#ifdef CONFIG_BFQ_REDIRECT_TO_CONSOLE
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+static struct bfq_group *bfqq_group(struct bfq_queue *bfqq);
+static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg);
+
+#define bfq_log_bfqq(bfqd, bfqq, fmt, args...)	do {			\
+	char __pbuf[128];						\
+									\
+	assert_spin_locked((bfqd)->queue->queue_lock);			\
+	blkg_path(bfqg_to_blkg(bfqq_group(bfqq)), __pbuf, sizeof(__pbuf)); \
+	pr_crit("bfq%d%c %s " fmt "\n", 			\
+		(bfqq)->pid,						\
+		bfq_bfqq_sync((bfqq)) ? 'S' : 'A',			\
+		__pbuf, ##args);					\
+} while (0)
+
+#define bfq_log_bfqg(bfqd, bfqg, fmt, args...)	do {			\
+	char __pbuf[128];						\
+									\
+	blkg_path(bfqg_to_blkg(bfqg), __pbuf, sizeof(__pbuf));		\
+	pr_crit("%s " fmt "\n", __pbuf, ##args);	\
+} while (0)
+
+#else /* CONFIG_BFQ_GROUP_IOSCHED */
+
+#define bfq_log_bfqq(bfqd, bfqq, fmt, args...)		\
+	pr_crit("bfq%d%c " fmt "\n", (bfqq)->pid,		\
+		bfq_bfqq_sync((bfqq)) ? 'S' : 'A',	\
+		##args)
+#define bfq_log_bfqg(bfqd, bfqg, fmt, args...)		do {} while (0)
+
+#endif /* CONFIG_BFQ_GROUP_IOSCHED */
+
+#define bfq_log(bfqd, fmt, args...) \
+	pr_crit("bfq " fmt "\n", ##args)
+
+#else /* CONFIG_BFQ_REDIRECT_TO_CONSOLE */
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+static struct bfq_group *bfqq_group(struct bfq_queue *bfqq);
+static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg);
+
+#define bfq_log_bfqq(bfqd, bfqq, fmt, args...)	do {			\
+	char __pbuf[128];						\
+									\
+	assert_spin_locked((bfqd)->queue->queue_lock);			\
+	blkg_path(bfqg_to_blkg(bfqq_group(bfqq)), __pbuf, sizeof(__pbuf)); \
+	blk_add_trace_msg((bfqd)->queue, "bfq%d%c %s " fmt, \
+			  (bfqq)->pid,			  \
+			  bfq_bfqq_sync((bfqq)) ? 'S' : 'A',	\
+			  __pbuf, ##args);				\
+} while (0)
+
+#define bfq_log_bfqg(bfqd, bfqg, fmt, args...)	do {			\
+	char __pbuf[128];						\
+									\
+	blkg_path(bfqg_to_blkg(bfqg), __pbuf, sizeof(__pbuf));		\
+	blk_add_trace_msg((bfqd)->queue, "%s " fmt, __pbuf, ##args);	\
+} while (0)
+
+#else /* CONFIG_BFQ_GROUP_IOSCHED */
+
+#define bfq_log_bfqq(bfqd, bfqq, fmt, args...)	\
+	blk_add_trace_msg((bfqd)->queue, "bfq%d%c " fmt, (bfqq)->pid,	\
+			bfq_bfqq_sync((bfqq)) ? 'S' : 'A',		\
+				##args)
+#define bfq_log_bfqg(bfqd, bfqg, fmt, args...)		do {} while (0)
+
+#endif /* CONFIG_BFQ_GROUP_IOSCHED */
 
 #define bfq_log(bfqd, fmt, args...) \
 	blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args)
+#endif /* CONFIG_BFQ_REDIRECT_TO_CONSOLE */
 
 /* Expiration reasons. */
 enum bfqq_expiration {
@@ -640,15 +730,12 @@ enum bfqq_expiration {
 	BFQ_BFQQ_BUDGET_TIMEOUT,	/* budget took too long to be used */
 	BFQ_BFQQ_BUDGET_EXHAUSTED,	/* budget consumed */
 	BFQ_BFQQ_NO_MORE_REQUESTS,	/* the queue has no more requests */
+	BFQ_BFQQ_PREEMPTED		/* preemption in progress */
 };
 
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
 
 struct bfqg_stats {
-	/* total bytes transferred */
-	struct blkg_rwstat		service_bytes;
-	/* total IOs serviced, post merge */
-	struct blkg_rwstat		serviced;
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
 	/* number of ios merged */
 	struct blkg_rwstat		merged;
 	/* total time spent on device in ns, may not be accurate w/ queueing */
@@ -657,12 +744,8 @@ struct bfqg_stats {
 	struct blkg_rwstat		wait_time;
 	/* number of IOs queued up */
 	struct blkg_rwstat		queued;
-	/* total sectors transferred */
-	struct blkg_stat		sectors;
 	/* total disk time and nr sectors dispatched by this group */
 	struct blkg_stat		time;
-	/* time not charged to this cgroup */
-	struct blkg_stat		unaccounted_time;
 	/* sum of number of ios queued across all samples */
 	struct blkg_stat		avg_queue_size_sum;
 	/* count of samples taken for average */
@@ -680,8 +763,10 @@ struct bfqg_stats {
 	uint64_t			start_idle_time;
 	uint64_t			start_empty_time;
 	uint16_t			flags;
+#endif
 };
 
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
 /*
  * struct bfq_group_data - per-blkcg storage for the blkio subsystem.
  *
@@ -692,7 +777,7 @@ struct bfq_group_data {
 	/* must be the first member */
 	struct blkcg_policy_data pd;
 
-	unsigned short weight;
+	unsigned int weight;
 };
 
 /**
@@ -712,7 +797,7 @@ struct bfq_group_data {
  *                   unused for the root group. Used to know whether there
  *                   are groups with more than one active @bfq_entity
  *                   (see the comments to the function
- *                   bfq_bfqq_must_not_expire()).
+ *                   bfq_bfqq_may_idle()).
  * @rq_pos_tree: rbtree sorted by next_request position, used when
  *               determining if two or more queues have interleaving
  *               requests (see bfq_find_close_cooperator()).
@@ -745,7 +830,6 @@ struct bfq_group {
 	struct rb_root rq_pos_tree;
 
 	struct bfqg_stats stats;
-	struct bfqg_stats dead_stats;	/* stats pushed from dead children */
 };
 
 #else
@@ -761,17 +845,38 @@ struct bfq_group {
 
 static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity);
 
+static unsigned int bfq_class_idx(struct bfq_entity *entity)
+{
+	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+
+	return bfqq ? bfqq->ioprio_class - 1 :
+		BFQ_DEFAULT_GRP_CLASS - 1;
+}
+
 static struct bfq_service_tree *
 bfq_entity_service_tree(struct bfq_entity *entity)
 {
 	struct bfq_sched_data *sched_data = entity->sched_data;
 	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
-	unsigned int idx = bfqq ? bfqq->ioprio_class - 1 :
-				  BFQ_DEFAULT_GRP_CLASS;
+	unsigned int idx = bfq_class_idx(entity);
 
 	BUG_ON(idx >= BFQ_IOPRIO_CLASSES);
 	BUG_ON(sched_data == NULL);
 
+	if (bfqq)
+		bfq_log_bfqq(bfqq->bfqd, bfqq,
+			     "entity_service_tree %p %d",
+			     sched_data->service_tree + idx, idx);
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+	else {
+		struct bfq_group *bfqg =
+			container_of(entity, struct bfq_group, entity);
+
+		bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg,
+			     "entity_service_tree %p %d",
+			     sched_data->service_tree + idx, idx);
+	}
+#endif
 	return sched_data->service_tree + idx;
 }
 
@@ -791,47 +896,6 @@ static struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic)
 	return bic->icq.q->elevator->elevator_data;
 }
 
-/**
- * bfq_get_bfqd_locked - get a lock to a bfqd using a RCU protected pointer.
- * @ptr: a pointer to a bfqd.
- * @flags: storage for the flags to be saved.
- *
- * This function allows bfqg->bfqd to be protected by the
- * queue lock of the bfqd they reference; the pointer is dereferenced
- * under RCU, so the storage for bfqd is assured to be safe as long
- * as the RCU read side critical section does not end.  After the
- * bfqd->queue->queue_lock is taken the pointer is rechecked, to be
- * sure that no other writer accessed it.  If we raced with a writer,
- * the function returns NULL, with the queue unlocked, otherwise it
- * returns the dereferenced pointer, with the queue locked.
- */
-static struct bfq_data *bfq_get_bfqd_locked(void **ptr, unsigned long *flags)
-{
-	struct bfq_data *bfqd;
-
-	rcu_read_lock();
-	bfqd = rcu_dereference(*(struct bfq_data **)ptr);
-
-	if (bfqd != NULL) {
-		spin_lock_irqsave(bfqd->queue->queue_lock, *flags);
-		if (ptr == NULL)
-			printk(KERN_CRIT "get_bfqd_locked pointer NULL\n");
-		else if (*ptr == bfqd)
-			goto out;
-		spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags);
-	}
-
-	bfqd = NULL;
-out:
-	rcu_read_unlock();
-	return bfqd;
-}
-
-static void bfq_put_bfqd_unlock(struct bfq_data *bfqd, unsigned long *flags)
-{
-	spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags);
-}
-
 #ifdef CONFIG_BFQ_GROUP_IOSCHED
 
 static struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq)
@@ -857,11 +921,13 @@ static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio);
 static void bfq_put_queue(struct bfq_queue *bfqq);
 static void bfq_dispatch_insert(struct request_queue *q, struct request *rq);
 static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
-				       struct bio *bio, int is_sync,
-				       struct bfq_io_cq *bic, gfp_t gfp_mask);
+				       struct bio *bio, bool is_sync,
+				       struct bfq_io_cq *bic);
 static void bfq_end_wr_async_queues(struct bfq_data *bfqd,
 				    struct bfq_group *bfqg);
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
 static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg);
+#endif
 static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq);
 
 #endif /* _BFQ_H */
-- 
2.10.0