~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~

TOMOYO Linux Cross Reference
Linux/fs/smb/client/compress.c

Version: ~ [ linux-6.12-rc7 ] ~ [ linux-6.11.7 ] ~ [ linux-6.10.14 ] ~ [ linux-6.9.12 ] ~ [ linux-6.8.12 ] ~ [ linux-6.7.12 ] ~ [ linux-6.6.60 ] ~ [ linux-6.5.13 ] ~ [ linux-6.4.16 ] ~ [ linux-6.3.13 ] ~ [ linux-6.2.16 ] ~ [ linux-6.1.116 ] ~ [ linux-6.0.19 ] ~ [ linux-5.19.17 ] ~ [ linux-5.18.19 ] ~ [ linux-5.17.15 ] ~ [ linux-5.16.20 ] ~ [ linux-5.15.171 ] ~ [ linux-5.14.21 ] ~ [ linux-5.13.19 ] ~ [ linux-5.12.19 ] ~ [ linux-5.11.22 ] ~ [ linux-5.10.229 ] ~ [ linux-5.9.16 ] ~ [ linux-5.8.18 ] ~ [ linux-5.7.19 ] ~ [ linux-5.6.19 ] ~ [ linux-5.5.19 ] ~ [ linux-5.4.285 ] ~ [ linux-5.3.18 ] ~ [ linux-5.2.21 ] ~ [ linux-5.1.21 ] ~ [ linux-5.0.21 ] ~ [ linux-4.20.17 ] ~ [ linux-4.19.323 ] ~ [ linux-4.18.20 ] ~ [ linux-4.17.19 ] ~ [ linux-4.16.18 ] ~ [ linux-4.15.18 ] ~ [ linux-4.14.336 ] ~ [ linux-4.13.16 ] ~ [ linux-4.12.14 ] ~ [ linux-4.11.12 ] ~ [ linux-4.10.17 ] ~ [ linux-4.9.337 ] ~ [ linux-4.4.302 ] ~ [ linux-3.10.108 ] ~ [ linux-2.6.32.71 ] ~ [ linux-2.6.0 ] ~ [ linux-2.4.37.11 ] ~ [ unix-v6-master ] ~ [ ccs-tools-1.8.12 ] ~ [ policy-sample ] ~
Architecture: ~ [ i386 ] ~ [ alpha ] ~ [ m68k ] ~ [ mips ] ~ [ ppc ] ~ [ sparc ] ~ [ sparc64 ] ~

  1 // SPDX-License-Identifier: GPL-2.0-only
  2 /*
  3  * Copyright (C) 2024, SUSE LLC
  4  *
  5  * Authors: Enzo Matsumiya <ematsumiya@suse.de>
  6  *
  7  * This file implements I/O compression support for SMB2 messages (SMB 3.1.1 only).
  8  * See compress/ for implementation details of each algorithm.
  9  *
 10  * References:
 11  * MS-SMB2 "3.1.4.4 Compressing the Message"
 12  * MS-SMB2 "3.1.5.3 Decompressing the Chained Message"
 13  * MS-XCA - for details of the supported algorithms
 14  */
 15 #include <linux/slab.h>
 16 #include <linux/kernel.h>
 17 #include <linux/uio.h>
 18 #include <linux/sort.h>
 19 
 20 #include "cifsglob.h"
 21 #include "../common/smb2pdu.h"
 22 #include "cifsproto.h"
 23 #include "smb2proto.h"
 24 
 25 #include "compress/lz77.h"
 26 #include "compress.h"
 27 
 28 /*
 29  * The heuristic_*() functions below try to determine data compressibility.
 30  *
 31  * Derived from fs/btrfs/compression.c, changing coding style, some parameters, and removing
 32  * unused parts.
 33  *
 34  * Read that file for better and more detailed explanation of the calculations.
 35  *
 * The algorithms are run in a collected sample of the input (uncompressed) data.
 37  * The sample is formed of 2K reads in PAGE_SIZE intervals, with a maximum size of 4M.
 38  *
 39  * Parsing the sample goes from "low-hanging fruits" (fastest algorithms, likely compressible)
 40  * to "need more analysis" (likely uncompressible).
 41  */
 42 
/* One histogram slot: how many times a given byte value occurred in the sample. */
struct bucket {
        unsigned int count;
};
 46 
 47 /**
 48  * has_low_entropy() - Compute Shannon entropy of the sampled data.
 49  * @bkt:        Bytes counts of the sample.
 50  * @slen:       Size of the sample.
 51  *
 52  * Return: true if the level (percentage of number of bits that would be required to
 53  *         compress the data) is below the minimum threshold.
 54  *
 55  * Note:
 56  * There _is_ an entropy level here that's > 65 (minimum threshold) that would indicate a
 57  * possibility of compression, but compressing, or even further analysing, it would waste so much
 58  * resources that it's simply not worth it.
 59  *
 60  * Also Shannon entropy is the last computed heuristic; if we got this far and ended up
 61  * with uncertainty, just stay on the safe side and call it uncompressible.
 62  */
 63 static bool has_low_entropy(struct bucket *bkt, size_t slen)
 64 {
 65         const size_t threshold = 65, max_entropy = 8 * ilog2(16);
 66         size_t i, p, p2, len, sum = 0;
 67 
 68 #define pow4(n) (n * n * n * n)
 69         len = ilog2(pow4(slen));
 70 
 71         for (i = 0; i < 256 && bkt[i].count > 0; i++) {
 72                 p = bkt[i].count;
 73                 p2 = ilog2(pow4(p));
 74                 sum += p * (len - p2);
 75         }
 76 
 77         sum /= slen;
 78 
 79         return ((sum * 100 / max_entropy) <= threshold);
 80 }
 81 
 82 #define BYTE_DIST_BAD           0
 83 #define BYTE_DIST_GOOD          1
 84 #define BYTE_DIST_MAYBE         2
 85 /**
 86  * calc_byte_distribution() - Compute byte distribution on the sampled data.
 87  * @bkt:        Byte counts of the sample.
 88  * @slen:       Size of the sample.
 89  *
 90  * Return:
 91  * BYTE_DIST_BAD:       A "hard no" for compression -- a computed uniform distribution of
 92  *                      the bytes (e.g. random or encrypted data).
 93  * BYTE_DIST_GOOD:      High probability (normal (Gaussian) distribution) of the data being
 94  *                      compressible.
 95  * BYTE_DIST_MAYBE:     When computed byte distribution resulted in "low > n < high"
 96  *                      grounds.  has_low_entropy() should be used for a final decision.
 97  */
 98 static int calc_byte_distribution(struct bucket *bkt, size_t slen)
 99 {
100         const size_t low = 64, high = 200, threshold = slen * 90 / 100;
101         size_t sum = 0;
102         int i;
103 
104         for (i = 0; i < low; i++)
105                 sum += bkt[i].count;
106 
107         if (sum > threshold)
108                 return BYTE_DIST_BAD;
109 
110         for (; i < high && bkt[i].count > 0; i++) {
111                 sum += bkt[i].count;
112                 if (sum > threshold)
113                         break;
114         }
115 
116         if (i <= low)
117                 return BYTE_DIST_GOOD;
118 
119         if (i >= high)
120                 return BYTE_DIST_BAD;
121 
122         return BYTE_DIST_MAYBE;
123 }
124 
125 static bool is_mostly_ascii(const struct bucket *bkt)
126 {
127         size_t count = 0;
128         int i;
129 
130         for (i = 0; i < 256; i++)
131                 if (bkt[i].count > 0)
132                         /* Too many non-ASCII (0-63) bytes. */
133                         if (++count > 64)
134                                 return false;
135 
136         return true;
137 }
138 
139 static bool has_repeated_data(const u8 *sample, size_t len)
140 {
141         size_t s = len / 2;
142 
143         return (!memcmp(&sample[0], &sample[s], s));
144 }
145 
146 static int cmp_bkt(const void *_a, const void *_b)
147 {
148         const struct bucket *a = _a, *b = _b;
149 
150         /* Reverse sort. */
151         if (a->count > b->count)
152                 return -1;
153 
154         return 1;
155 }
156 
/*
 * TODO:
 * Support other iter types, if required.
 * Only ITER_XARRAY is supported for now.
 */

/*
 * Walk up to @max bytes of the xarray-backed @iter, copying the first SZ_2K
 * of each page contiguously into @sample.
 *
 * Returns the number of bytes stored in @sample, or -EIO if xa_extract()
 * finds no folios in the requested range.
 */
static int collect_sample(const struct iov_iter *iter, ssize_t max, u8 *sample)
{
        struct folio *folios[16], *folio;
        unsigned int nr, i, j, npages;
        /* Absolute position where the iterator's data starts. */
        loff_t start = iter->xarray_start + iter->iov_offset;
        pgoff_t last, index = start / PAGE_SIZE;
        size_t len, off, foff;
        void *p;
        int s = 0;	/* Bytes stored in @sample so far. */

        last = (start + max - 1) / PAGE_SIZE;
        do {
                /* Fetch up to 16 present folios at a time from the mapping. */
                nr = xa_extract(iter->xarray, (void **)folios, index, last, ARRAY_SIZE(folios),
                                XA_PRESENT);
                if (nr == 0)
                        return -EIO;

                for (i = 0; i < nr; i++) {
                        folio = folios[i];
                        npages = folio_nr_pages(folio);
                        /* Byte offset of @start within this folio. */
                        foff = start - folio_pos(folio);
                        off = foff % PAGE_SIZE;

                        /* Sample each remaining page of the folio. */
                        for (j = foff / PAGE_SIZE; j < npages; j++) {
                                size_t len2;

                                /* Input left in this page, capped at 2K of output. */
                                len = min_t(size_t, max, PAGE_SIZE - off);
                                len2 = min_t(size_t, len, SZ_2K);

                                /*
                                 * NOTE(review): the copy starts at the beginning of
                                 * the page even when @off > 0 -- presumably @start is
                                 * page-aligned for write payloads; verify with callers.
                                 */
                                p = kmap_local_page(folio_page(folio, j));
                                memcpy(&sample[s], p, len2);
                                kunmap_local(p);

                                s += len2;

                                /*
                                 * Stop on a short (tail) page, or once within 2K of
                                 * the @max-sized sample buffer -- another full 2K
                                 * chunk might not fit.
                                 */
                                if (len2 < SZ_2K || s >= max - SZ_2K)
                                        return s;

                                /* @max tracks consumed input, not sampled output. */
                                max -= len;
                                if (max <= 0)
                                        return s;

                                start += len;
                                off = 0;
                                index++;
                        }
                }
        } while (nr == ARRAY_SIZE(folios));

        return s;
}
213 
214 /**
215  * is_compressible() - Determines if a chunk of data is compressible.
216  * @data: Iterator containing uncompressed data.
217  *
218  * Return: true if @data is compressible, false otherwise.
219  *
220  * Tests shows that this function is quite reliable in predicting data compressibility,
221  * matching close to 1:1 with the behaviour of LZ77 compression success and failures.
222  */
223 static bool is_compressible(const struct iov_iter *data)
224 {
225         const size_t read_size = SZ_2K, bkt_size = 256, max = SZ_4M;
226         struct bucket *bkt = NULL;
227         size_t len;
228         u8 *sample;
229         bool ret = false;
230         int i;
231 
232         /* Preventive double check -- already checked in should_compress(). */
233         len = iov_iter_count(data);
234         if (unlikely(len < read_size))
235                 return ret;
236 
237         if (len - read_size > max)
238                 len = max;
239 
240         sample = kvzalloc(len, GFP_KERNEL);
241         if (!sample) {
242                 WARN_ON_ONCE(1);
243 
244                 return ret;
245         }
246 
247         /* Sample 2K bytes per page of the uncompressed data. */
248         i = collect_sample(data, len, sample);
249         if (i <= 0) {
250                 WARN_ON_ONCE(1);
251 
252                 goto out;
253         }
254 
255         len = i;
256         ret = true;
257 
258         if (has_repeated_data(sample, len))
259                 goto out;
260 
261         bkt = kcalloc(bkt_size, sizeof(*bkt), GFP_KERNEL);
262         if (!bkt) {
263                 WARN_ON_ONCE(1);
264                 ret = false;
265 
266                 goto out;
267         }
268 
269         for (i = 0; i < len; i++)
270                 bkt[sample[i]].count++;
271 
272         if (is_mostly_ascii(bkt))
273                 goto out;
274 
275         /* Sort in descending order */
276         sort(bkt, bkt_size, sizeof(*bkt), cmp_bkt, NULL);
277 
278         i = calc_byte_distribution(bkt, len);
279         if (i != BYTE_DIST_MAYBE) {
280                 ret = !!i;
281 
282                 goto out;
283         }
284 
285         ret = has_low_entropy(bkt, len);
286 out:
287         kvfree(sample);
288         kfree(bkt);
289 
290         return ret;
291 }
292 
293 bool should_compress(const struct cifs_tcon *tcon, const struct smb_rqst *rq)
294 {
295         const struct smb2_hdr *shdr = rq->rq_iov->iov_base;
296 
297         if (unlikely(!tcon || !tcon->ses || !tcon->ses->server))
298                 return false;
299 
300         if (!tcon->ses->server->compression.enabled)
301                 return false;
302 
303         if (!(tcon->share_flags & SMB2_SHAREFLAG_COMPRESS_DATA))
304                 return false;
305 
306         if (shdr->Command == SMB2_WRITE) {
307                 const struct smb2_write_req *wreq = rq->rq_iov->iov_base;
308 
309                 if (le32_to_cpu(wreq->Length) < SMB_COMPRESS_MIN_LEN)
310                         return false;
311 
312                 return is_compressible(&rq->rq_iter);
313         }
314 
315         return (shdr->Command == SMB2_READ);
316 }
317 
/*
 * smb_compress() - Compress a WRITE request's payload and send it via @send_fn.
 * @server:	Connection the request goes out on.
 * @rq:		Request whose rq_iov[0] holds the SMB2 WRITE header and whose
 *		rq_iter holds the uncompressed payload.
 * @send_fn:	Transport callback used to send the (possibly compressed) request.
 *
 * On successful compression, sends a chained compressed message:
 * [compression transform header][uncompressed WRITE header][compressed payload].
 * If the data didn't shrink, sends the original @rq unmodified.
 *
 * Returns the result of @send_fn, or a negative errno on setup failure.
 */
int smb_compress(struct TCP_Server_Info *server, struct smb_rqst *rq, compress_send_fn send_fn)
{
        struct iov_iter iter;
        u32 slen, dlen;
        void *src, *dst = NULL;
        int ret;

        if (!server || !rq || !rq->rq_iov || !rq->rq_iov->iov_base)
                return -EINVAL;

        /* Only a bare SMB2 WRITE header is expected in the first vector. */
        if (rq->rq_iov->iov_len != sizeof(struct smb2_write_req))
                return -EINVAL;

        /* Linearize the payload into @src for the compressor. */
        slen = iov_iter_count(&rq->rq_iter);
        src = kvzalloc(slen, GFP_KERNEL);
        if (!src) {
                ret = -ENOMEM;
                goto err_free;
        }

        /* Keep the original iter intact. */
        iter = rq->rq_iter;

        if (!copy_from_iter_full(src, slen, &iter)) {
                ret = -EIO;
                goto err_free;
        }

        /*
         * This is just overprovisioning, as the algorithm will error out if @dst reaches 7/8
         * of @slen.
         */
        dlen = slen;
        dst = kvzalloc(dlen, GFP_KERNEL);
        if (!dst) {
                ret = -ENOMEM;
                goto err_free;
        }

        ret = lz77_compress(src, slen, dst, &dlen);
        if (!ret) {
                struct smb2_compression_hdr hdr = { 0 };
                struct smb_rqst comp_rq = { .rq_nvec = 3, };
                struct kvec iov[3];

                /* MS-SMB2 3.1.4.4: transform header precedes the chained message. */
                hdr.ProtocolId = SMB2_COMPRESSION_TRANSFORM_ID;
                hdr.OriginalCompressedSegmentSize = cpu_to_le32(slen);
                hdr.CompressionAlgorithm = SMB3_COMPRESS_LZ77;
                hdr.Flags = SMB2_COMPRESSION_FLAG_NONE;
                /* Length of the uncompressed part (the WRITE header) before the payload. */
                hdr.Offset = cpu_to_le32(rq->rq_iov[0].iov_len);

                iov[0].iov_base = &hdr;
                iov[0].iov_len = sizeof(hdr);
                iov[1] = rq->rq_iov[0];	/* Uncompressed SMB2 WRITE header. */
                iov[2].iov_base = dst;
                iov[2].iov_len = dlen;

                comp_rq.rq_iov = iov;

                ret = send_fn(server, 1, &comp_rq);
        } else if (ret == -EMSGSIZE || dlen >= slen) {
                /*
                 * Data didn't compress: fall back to the original request.
                 * NOTE(review): lz77_compress() errors other than -EMSGSIZE also
                 * take this path whenever dlen >= slen -- confirm that's intended.
                 */
                ret = send_fn(server, 1, rq);
        }
err_free:
        kvfree(dst);
        kvfree(src);

        return ret;
}
387 

~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~

kernel.org | git.kernel.org | LWN.net | Project Home | SVN repository | Mail admin

Linux® is a registered trademark of Linus Torvalds in the United States and other countries.
TOMOYO® is a registered trademark of NTT DATA CORPORATION.

sflogo.php