Paul Eggert <eggert@HIDDEN>
to control <at> debbugs.gnu.org.
Full text available.Received: (at 41535) by debbugs.gnu.org; 29 Aug 2021 08:48:12 +0000 From debbugs-submit-bounces <at> debbugs.gnu.org Sun Aug 29 04:48:12 2021 Received: from localhost ([127.0.0.1]:55481 helo=debbugs.gnu.org) by debbugs.gnu.org with esmtp (Exim 4.84_2) (envelope-from <debbugs-submit-bounces <at> debbugs.gnu.org>) id 1mKGU4-0005fw-Se for submit <at> debbugs.gnu.org; Sun, 29 Aug 2021 04:48:12 -0400 Received: from mail-wr1-f44.google.com ([209.85.221.44]:34382) by debbugs.gnu.org with esmtp (Exim 4.84_2) (envelope-from <meyering@HIDDEN>) id 1mKGTz-0005fN-T6 for 41535 <at> debbugs.gnu.org; Sun, 29 Aug 2021 04:48:08 -0400 Received: by mail-wr1-f44.google.com with SMTP id h13so17611928wrp.1 for <41535 <at> debbugs.gnu.org>; Sun, 29 Aug 2021 01:48:03 -0700 (PDT) X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20161025; h=x-gm-message-state:mime-version:references:in-reply-to:from:date :message-id:subject:to:cc:content-transfer-encoding; bh=pTgvO80U3R8U7a3GILKksocEMsWiqzF8wn7g3b4PYu8=; b=N5HWVSnYafnu4EEL0ViUOsdwXV8iGgrPI7sx9+to7JkqikIQnq0hEf8lT61tV0QexU /AH4M/5owNZifSUasl5wORNYOJmRY9ZSGRJP5SAfVt06H2AC4yF9PeIx3ugl0BgISu1r jhcgB1JwuFziSeVM10VUVKYSpZjRg74lWyxmdDYpnAvJLwXg3yqrNHi6Vxf/9qIyPG7f GxQeDxhtnVzdiDVwhBqxAzGLymFgb3eZjntGvERljTbbSzS9CtGhXvGtOMxdjqidSuWh 4z5tEbxpBowVS7NQ9keqgQztsPLdDKt5SXJRMT+rOcWCg1YANFb7iOGjXOSoCSDUt0cU TShA== X-Gm-Message-State: AOAM533NbaopG1k/LTKz1hzOEh2K3ShcxiXLKZtPTqdlrElvJPW9Piv/ 86vVwM15Iu1+B8KtVsKMIVAUucO3y5oTqLCToz0= X-Google-Smtp-Source: ABdhPJz6wPyKp69/1OOVWpO3ykD3/y6G+fPMn+PonG01u68jvhfRV31xyTiNy/5n9LUYaAwb7qt9+vvlMVUtg+oJy1A= X-Received: by 2002:a5d:6cab:: with SMTP id a11mr16752642wra.287.1630226877943; Sun, 29 Aug 2021 01:47:57 -0700 (PDT) MIME-Version: 1.0 References: <20200526023940.1967-1-liqiang64@HIDDEN> <ac086349-18f9-9ead-11ea-fb0b55d15974@HIDDEN> In-Reply-To: <ac086349-18f9-9ead-11ea-fb0b55d15974@HIDDEN> From: Jim Meyering <jim@HIDDEN> Date: Sun, 29 Aug 2021 10:47:45 +0200 
Message-ID: <CA+8g5KGOQ-r4o+mV11DuJqcfYcaYPcvQNgoP7P2poWv2k5K90Q@HIDDEN> Subject: Re: bug#41535: [PATCH] performance optimization for aarch64 To: Li Qiang <liqiang64@HIDDEN> Content-Type: text/plain; charset="UTF-8" Content-Transfer-Encoding: quoted-printable X-Spam-Score: 0.5 (/) X-Debbugs-Envelope-To: 41535 Cc: luanjianhai@HIDDEN, Paul Eggert <eggert@HIDDEN>, sangyan@HIDDEN, colordev.jiang@HIDDEN, luchunhua@HIDDEN, 41535 <at> debbugs.gnu.org, huxinwei@HIDDEN, Jim Meyering <meyering@HIDDEN> X-BeenThere: debbugs-submit <at> debbugs.gnu.org X-Mailman-Version: 2.1.18 Precedence: list List-Id: <debbugs-submit.debbugs.gnu.org> List-Unsubscribe: <https://debbugs.gnu.org/cgi-bin/mailman/options/debbugs-submit>, <mailto:debbugs-submit-request <at> debbugs.gnu.org?subject=unsubscribe> List-Archive: <https://debbugs.gnu.org/cgi-bin/mailman/private/debbugs-submit/> List-Post: <mailto:debbugs-submit <at> debbugs.gnu.org> List-Help: <mailto:debbugs-submit-request <at> debbugs.gnu.org?subject=help> List-Subscribe: <https://debbugs.gnu.org/cgi-bin/mailman/listinfo/debbugs-submit>, <mailto:debbugs-submit-request <at> debbugs.gnu.org?subject=subscribe> Errors-To: debbugs-submit-bounces <at> debbugs.gnu.org Sender: "Debbugs-submit" <debbugs-submit-bounces <at> debbugs.gnu.org> X-Spam-Score: -0.5 (/) On Sat, May 30, 2020 at 11:19 AM Li Qiang <liqiang64@HIDDEN> wrote: > 在 2020/5/26 10:39, l00374334 写道: > > From: liqiang <liqiang64@HIDDEN> > > > > By analyzing the compression and decompression process of gzip, I found > > > > that the hot spots of CRC32 and longest_match function are very high. > > > > > > > > On the aarch64 architecture, we can optimize the efficiency of crc32 > > > > through the interface provided by the neon instruction set (12x faster > > > > in aarch64), and optimize the performance of random access code through > > > > prefetch instructions (about 5%~8% improvement). 
In some compression > > > > scenarios, loop expansion can also get a certain performance improvement > > > > (about 10%). > > > > > > > > Modify by Li Qiang. > > > > --- > > configure | 14 ++++++++++++++ > > deflate.c | 30 +++++++++++++++++++++++++++++- > > util.c | 45 +++++++++++++++++++++++++++++++++++++++++++++ Thank you for that work and sorry for the delay in responding. However, for now I prefer not to apply it. I'd prefer to see arch-specific optimizations added to libz in the hope (perhaps naive) that someone will find time to make gzip use libz.
bug-gzip@HIDDEN:bug#41535; Package gzip.
Full text available.
Received: (at 41535) by debbugs.gnu.org; 20 Aug 2020 08:55:42 +0000
From debbugs-submit-bounces <at> debbugs.gnu.org Thu Aug 20 04:55:42 2020
Received: from localhost ([127.0.0.1]:41602 helo=debbugs.gnu.org)
by debbugs.gnu.org with esmtp (Exim 4.84_2)
(envelope-from <debbugs-submit-bounces <at> debbugs.gnu.org>)
id 1k8gMH-0004AS-Qx
for submit <at> debbugs.gnu.org; Thu, 20 Aug 2020 04:55:42 -0400
Received: from szxga06-in.huawei.com ([45.249.212.32]:56900 helo=huawei.com)
by debbugs.gnu.org with esmtp (Exim 4.84_2)
(envelope-from <liqiang64@HIDDEN>) id 1k8gMC-0004AA-RR
for 41535 <at> debbugs.gnu.org; Thu, 20 Aug 2020 04:55:40 -0400
Received: from DGGEMS404-HUB.china.huawei.com (unknown [172.30.72.58])
by Forcepoint Email with ESMTP id 8B77EE9DAA5AC3580D5D;
Thu, 20 Aug 2020 16:55:28 +0800 (CST)
Received: from [127.0.0.1] (10.108.234.107) by DGGEMS404-HUB.china.huawei.com
(10.3.19.204) with Microsoft SMTP Server id 14.3.487.0;
Thu, 20 Aug 2020 16:55:27 +0800
Subject: Re: bug#41535: [PATCH] performance optimization for aarch64
From: Li Qiang <liqiang64@HIDDEN>
To: <41535 <at> debbugs.gnu.org>
References: <20200526023940.1967-1-liqiang64@HIDDEN>
<ac086349-18f9-9ead-11ea-fb0b55d15974@HIDDEN>
Message-ID: <5e6aa834-6dc0-4465-5807-088eb53abd05@HIDDEN>
Date: Thu, 20 Aug 2020 16:55:26 +0800
User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:68.0) Gecko/20100101
Thunderbird/68.8.1
MIME-Version: 1.0
In-Reply-To: <ac086349-18f9-9ead-11ea-fb0b55d15974@HIDDEN>
Content-Type: text/plain; charset="gbk"
Content-Transfer-Encoding: 8bit
X-Originating-IP: [10.108.234.107]
X-CFilter-Loop: Reflected
X-Spam-Score: -3.7 (---)
X-Debbugs-Envelope-To: 41535
Cc: meyering@HIDDEN, eggert@HIDDEN
X-BeenThere: debbugs-submit <at> debbugs.gnu.org
X-Mailman-Version: 2.1.18
Precedence: list
List-Id: <debbugs-submit.debbugs.gnu.org>
List-Unsubscribe: <https://debbugs.gnu.org/cgi-bin/mailman/options/debbugs-submit>,
<mailto:debbugs-submit-request <at> debbugs.gnu.org?subject=unsubscribe>
List-Archive: <https://debbugs.gnu.org/cgi-bin/mailman/private/debbugs-submit/>
List-Post: <mailto:debbugs-submit <at> debbugs.gnu.org>
List-Help: <mailto:debbugs-submit-request <at> debbugs.gnu.org?subject=help>
List-Subscribe: <https://debbugs.gnu.org/cgi-bin/mailman/listinfo/debbugs-submit>,
<mailto:debbugs-submit-request <at> debbugs.gnu.org?subject=subscribe>
Errors-To: debbugs-submit-bounces <at> debbugs.gnu.org
Sender: "Debbugs-submit" <debbugs-submit-bounces <at> debbugs.gnu.org>
X-Spam-Score: -4.7 (----)
在 2020/5/30 17:17, Li Qiang 写道:
>
>
> 在 2020/5/26 10:39, l00374334 写道:
>> From: liqiang <liqiang64@HIDDEN>
>>
>> By analyzing the compression and decompression process of gzip, I found
>>
>> that the hot spots of CRC32 and longest_match function are very high.
>>
>>
>>
>> On the aarch64 architecture, we can optimize the efficiency of crc32
>>
>> through the interface provided by the neon instruction set (12x faster
>>
>> in aarch64), and optimize the performance of random access code through
>>
>> prefetch instructions (about 5%~8% improvement). In some compression
>>
>> scenarios, loop expansion can also get a certain performance improvement
>>
>> (about 10%).
>>
>>
>>
>> Modify by Li Qiang.
>>
>> ---
>> configure | 14 ++++++++++++++
>> deflate.c | 30 +++++++++++++++++++++++++++++-
>> util.c | 45 +++++++++++++++++++++++++++++++++++++++++++++
>> 3 files changed, 88 insertions(+), 1 deletion(-)
>>
>> diff --git a/configure b/configure
>> index cab3daf..dc80cb6 100644
>> --- a/configure
>> +++ b/configure
>> @@ -14555,6 +14555,20 @@ rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
>> ;;
>>
>> arm* | aarch64 )
>> + cat confdefs.h - <<_ACEOF >conftest.$ac_ext
>> +/* end confdefs.h. */
>> +#if defined __ARM_NEON__ || defined __ARM_NEON
>> + int ok;
>> + #else
>> + error fail
>> + #endif
>> +
>> +_ACEOF
>> +if ac_fn_c_try_compile "$LINENO"
>> +then :
>> + CFLAGS="$CFLAGS -march=armv8-a+crc"
>> +fi
>> +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
>> # Assume arm with EABI.
>> # On arm64 systems, the C compiler may be generating code in one of
>> # these ABIs:
>> diff --git a/deflate.c b/deflate.c
>> index 9d379e9..ee77ffd 100644
>> --- a/deflate.c
>> +++ b/deflate.c
>> @@ -378,6 +378,9 @@ longest_match(IPos cur_match)
>> register int len; /* length of current match */
>>
>> int best_len = prev_length; /* best match length so far */
>>
>> IPos limit = strstart > (IPos)MAX_DIST ? strstart - (IPos)MAX_DIST : NIL;
>>
>> +#ifdef __aarch64__
>>
>> + IPos next_match;
>>
>> +#endif
>>
>> /* Stop when cur_match becomes <= limit. To simplify the code,
>>
>> * we prevent matches with the string of window index 0.
>>
>> */
>>
>> @@ -411,6 +414,10 @@ longest_match(IPos cur_match)
>> do {
>>
>> Assert(cur_match < strstart, "no future");
>>
>> match = window + cur_match;
>>
>> +#ifdef __aarch64__
>>
>> + next_match = prev[cur_match & WMASK];
>>
>> + __asm__("PRFM PLDL1STRM, [%0]"::"r"(&(prev[next_match & WMASK])));
>>
>> +#endif
>>
>>
>>
>> /* Skip to next match if the match length cannot increase
>>
>> * or if the match length is less than 2:
>>
>> @@ -488,8 +495,14 @@ longest_match(IPos cur_match)
>> scan_end = scan[best_len];
>>
>> #endif
>>
>> }
>>
>> - } while ((cur_match = prev[cur_match & WMASK]) > limit
>>
>> + }
>>
>> +#ifdef __aarch64__
>>
>> + while ((cur_match = next_match) > limit
>>
>> + && --chain_length != 0);
>>
>> +#else
>>
>> + while ((cur_match = prev[cur_match & WMASK]) > limit
>>
>> && --chain_length != 0);
>>
>> +#endif
>>
>>
>>
>> return best_len;
>>
>> }
>>
>> @@ -777,7 +790,22 @@ deflate (int pack_level)
>> lookahead -= prev_length-1;
>>
>> prev_length -= 2;
>>
>> RSYNC_ROLL(strstart, prev_length+1);
>>
>> + while (prev_length >= 4) {
>>
>> + /* After actual verification, expanding this loop
>>
>> + * can improve its performance in certain scenarios.
>>
>> + */
>>
>> + prev_length -= 4;
>>
>> + strstart++;
>>
>> + INSERT_STRING(strstart, hash_head);
>>
>> + strstart++;
>>
>> + INSERT_STRING(strstart, hash_head);
>>
>> + strstart++;
>>
>> + INSERT_STRING(strstart, hash_head);
>>
>> + strstart++;
>>
>> + INSERT_STRING(strstart, hash_head);
>>
>> + }
>>
>> do {
>>
>> + if (prev_length == 0) break;
>>
>> strstart++;
>>
>> INSERT_STRING(strstart, hash_head);
>>
>> /* strstart never exceeds WSIZE-MAX_MATCH, so there are
>>
>> diff --git a/util.c b/util.c
>> index 0a0fc21..c9f0e52 100644
>> --- a/util.c
>> +++ b/util.c
>> @@ -38,6 +38,12 @@
>>
>>
>> static int write_buffer (int, voidp, unsigned int);
>>
>>
>>
>> +#if defined __ARM_NEON__ || defined __ARM_NEON
>>
>> +#define CRC32D(crc, val) __asm__("crc32x %w[c], %w[c], %x[v]":[c]"+r"(crc):[v]"r"(val))
>>
>> +#define CRC32W(crc, val) __asm__("crc32w %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(val))
>>
>> +#define CRC32H(crc, val) __asm__("crc32h %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(val))
>>
>> +#define CRC32B(crc, val) __asm__("crc32b %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(val))
>>
>> +#else
>>
>> /* ========================================================================
>>
>> * Table of CRC-32's of all single-byte values (made by makecrc.c)
>>
>> */
>>
>> @@ -95,6 +101,7 @@ static const ulg crc_32_tab[] = {
>> 0x5d681b02L, 0x2a6f2b94L, 0xb40bbe37L, 0xc30c8ea1L, 0x5a05df1bL,
>>
>> 0x2d02ef8dL
>>
>> };
>>
>> +#endif
>>
>>
>>
>> /* Shift register contents. */
>>
>> static ulg crc = 0xffffffffL;
>>
>> @@ -132,6 +139,43 @@ ulg updcrc(s, n)
>> const uch *s; /* pointer to bytes to pump through */
>>
>> unsigned n; /* number of bytes in s[] */
>>
>> {
>>
>> +#if defined __ARM_NEON__ || defined __ARM_NEON
>>
>> + register ulg c;
>>
>> + static ulg crc = (ulg)0xffffffffL;
>>
>> + register const uint8_t *buf1;
>>
>> + register const uint16_t *buf2;
>>
>> + register const uint32_t *buf4;
>>
>> + register const uint64_t *buf8;
>>
>> + int64_t length = (int64_t)n;
>>
>> + buf8 = (const uint64_t *)(const void *)s;
>>
>> +
>>
>> + if (s == NULL) {
>>
>> + c = 0xffffffffL;
>>
>> + } else {
>>
>> + c = crc;
>>
>> + while(length >= sizeof(uint64_t)) {
>>
>> + CRC32D(c, *buf8++);
>>
>> + length -= sizeof(uint64_t);
>>
>> + }
>>
>> + buf4 = (const uint32_t *)(const void *)buf8;
>>
>> + if (length >= sizeof(uint32_t)) {
>>
>> + CRC32W(c, *buf4++);
>>
>> + length -= sizeof(uint32_t);
>>
>> + }
>>
>> + buf2 = (const uint16_t *)(const void *)buf4;
>>
>> + if(length >= sizeof(uint16_t)) {
>>
>> + CRC32H(c, *buf2++);
>>
>> + length -= sizeof(uint16_t);
>>
>> + }
>>
>> + buf1 = (const uint8_t *)(const void *)buf2;
>>
>> + if (length >= sizeof(uint8_t)) {
>>
>> + CRC32B(c, *buf1);
>>
>> + length -= sizeof(uint8_t);
>>
>> + }
>>
>> + }
>>
>> + crc = c;
>>
>> + return (c ^ 0xffffffffL);
>>
>> +#else
>>
>> register ulg c; /* temporary variable */
>>
>>
>>
>> if (s == NULL) {
>>
>> @@ -144,6 +188,7 @@ ulg updcrc(s, n)
>> }
>>
>> crc = c;
>>
>> return c ^ 0xffffffffL; /* (instead of ~c for 64-bit machines) */
>>
>> +#endif
>>
>> }
>>
>>
>>
>> /* Return a current CRC value. */
>>
>
> Please allow me to show a set of actual test data for this patch.
>
> First, I made an original version of the program "gzip-1.10" based
> on the gzip-1.10 source code, and then made an optimized version of
> the program "gzip-optimized" after applying my optimization patch.
>
> Next I use gzip-1.10 version to test the compression and decompression
> time on some **xml** files:
> [XML]# time ./gzip-1.10 *.xml
>
> real 0m5.099s
> user 0m4.384s
> sys 0m0.176s
> [XML]# time ./gzip-1.10 -d *.gz
>
> real 0m2.173s
> user 0m1.821s
> sys 0m0.348s
>
> Then use the optimized version to compare:
> [XML]# time ./gzip-optimized *.xml
>
> real 0m2.785s
> user 0m2.576s
> sys 0m0.204s
> [XML]# time ./gzip-optimized -d *.gz
>
> real 0m0.497s
> user 0m0.176s
> sys 0m0.320s
>
>
> The next test object is a large **log** file:
> [LOG]# time ./gzip-1.10 *.log
>
> real 0m8.883s
> user 0m8.652s
> sys 0m0.217s
> [LOG]# time ./gzip-1.10 -d *.gz
>
> real 0m3.049s
> user 0m2.604s
> sys 0m0.439s
>
> Also use the optimized version to compare:
> [LOG]# time ./gzip-optimized *.log
>
> real 0m6.882s
> user 0m6.607s
> sys 0m0.264s
> [LOG]# time ./gzip-optimized -d *.gz
>
> real 0m1.054s
> user 0m0.622s
> sys 0m0.431s
>
> The above experimental data are from the aarch64 platform.
>
Gentle ping.
: )
--
Best regards,
Li Qiang
bug-gzip@HIDDEN:bug#41535; Package gzip.
Full text available.
Received: (at 41535) by debbugs.gnu.org; 30 May 2020 09:18:12 +0000
From debbugs-submit-bounces <at> debbugs.gnu.org Sat May 30 05:18:12 2020
Received: from localhost ([127.0.0.1]:56954 helo=debbugs.gnu.org)
by debbugs.gnu.org with esmtp (Exim 4.84_2)
(envelope-from <debbugs-submit-bounces <at> debbugs.gnu.org>)
id 1jexd5-0001Z7-Hd
for submit <at> debbugs.gnu.org; Sat, 30 May 2020 05:18:11 -0400
Received: from szxga05-in.huawei.com ([45.249.212.191]:2296 helo=huawei.com)
by debbugs.gnu.org with esmtp (Exim 4.84_2)
(envelope-from <liqiang64@HIDDEN>) id 1jexd2-0001Yf-RA
for 41535 <at> debbugs.gnu.org; Sat, 30 May 2020 05:18:09 -0400
Received: from DGGEMS409-HUB.china.huawei.com (unknown [172.30.72.60])
by Forcepoint Email with ESMTP id 2696B6668135339376EC;
Sat, 30 May 2020 17:18:01 +0800 (CST)
Received: from [127.0.0.1] (10.108.222.92) by DGGEMS409-HUB.china.huawei.com
(10.3.19.209) with Microsoft SMTP Server id 14.3.487.0; Sat, 30 May 2020
17:17:51 +0800
Subject: bug#41535: [PATCH] performance optimization for aarch64
To: <41535 <at> debbugs.gnu.org>
References: <20200526023940.1967-1-liqiang64@HIDDEN>
From: Li Qiang <liqiang64@HIDDEN>
Message-ID: <ac086349-18f9-9ead-11ea-fb0b55d15974@HIDDEN>
Date: Sat, 30 May 2020 17:17:49 +0800
User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:68.0) Gecko/20100101
Thunderbird/68.8.1
MIME-Version: 1.0
In-Reply-To: <20200526023940.1967-1-liqiang64@HIDDEN>
Content-Type: text/plain; charset="gbk"
Content-Transfer-Encoding: 8bit
X-Originating-IP: [10.108.222.92]
X-CFilter-Loop: Reflected
X-Spam-Score: -2.3 (--)
X-Debbugs-Envelope-To: 41535
Cc: luanjianhai@HIDDEN, eggert@HIDDEN, sangyan@HIDDEN,
colordev.jiang@HIDDEN, luchunhua@HIDDEN, huxinwei@HIDDEN,
meyering@HIDDEN
X-BeenThere: debbugs-submit <at> debbugs.gnu.org
X-Mailman-Version: 2.1.18
Precedence: list
List-Id: <debbugs-submit.debbugs.gnu.org>
List-Unsubscribe: <https://debbugs.gnu.org/cgi-bin/mailman/options/debbugs-submit>,
<mailto:debbugs-submit-request <at> debbugs.gnu.org?subject=unsubscribe>
List-Archive: <https://debbugs.gnu.org/cgi-bin/mailman/private/debbugs-submit/>
List-Post: <mailto:debbugs-submit <at> debbugs.gnu.org>
List-Help: <mailto:debbugs-submit-request <at> debbugs.gnu.org?subject=help>
List-Subscribe: <https://debbugs.gnu.org/cgi-bin/mailman/listinfo/debbugs-submit>,
<mailto:debbugs-submit-request <at> debbugs.gnu.org?subject=subscribe>
Errors-To: debbugs-submit-bounces <at> debbugs.gnu.org
Sender: "Debbugs-submit" <debbugs-submit-bounces <at> debbugs.gnu.org>
X-Spam-Score: -3.3 (---)
在 2020/5/26 10:39, l00374334 写道:
> From: liqiang <liqiang64@HIDDEN>
>
> By analyzing the compression and decompression process of gzip, I found
>
> that the hot spots of CRC32 and longest_match function are very high.
>
>
>
> On the aarch64 architecture, we can optimize the efficiency of crc32
>
> through the interface provided by the neon instruction set (12x faster
>
> in aarch64), and optimize the performance of random access code through
>
> prefetch instructions (about 5%~8% improvement). In some compression
>
> scenarios, loop expansion can also get a certain performance improvement
>
> (about 10%).
>
>
>
> Modify by Li Qiang.
>
> ---
> configure | 14 ++++++++++++++
> deflate.c | 30 +++++++++++++++++++++++++++++-
> util.c | 45 +++++++++++++++++++++++++++++++++++++++++++++
> 3 files changed, 88 insertions(+), 1 deletion(-)
>
> diff --git a/configure b/configure
> index cab3daf..dc80cb6 100644
> --- a/configure
> +++ b/configure
> @@ -14555,6 +14555,20 @@ rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
> ;;
>
> arm* | aarch64 )
> + cat confdefs.h - <<_ACEOF >conftest.$ac_ext
> +/* end confdefs.h. */
> +#if defined __ARM_NEON__ || defined __ARM_NEON
> + int ok;
> + #else
> + error fail
> + #endif
> +
> +_ACEOF
> +if ac_fn_c_try_compile "$LINENO"
> +then :
> + CFLAGS="$CFLAGS -march=armv8-a+crc"
> +fi
> +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
> # Assume arm with EABI.
> # On arm64 systems, the C compiler may be generating code in one of
> # these ABIs:
> diff --git a/deflate.c b/deflate.c
> index 9d379e9..ee77ffd 100644
> --- a/deflate.c
> +++ b/deflate.c
> @@ -378,6 +378,9 @@ longest_match(IPos cur_match)
> register int len; /* length of current match */
>
> int best_len = prev_length; /* best match length so far */
>
> IPos limit = strstart > (IPos)MAX_DIST ? strstart - (IPos)MAX_DIST : NIL;
>
> +#ifdef __aarch64__
>
> + IPos next_match;
>
> +#endif
>
> /* Stop when cur_match becomes <= limit. To simplify the code,
>
> * we prevent matches with the string of window index 0.
>
> */
>
> @@ -411,6 +414,10 @@ longest_match(IPos cur_match)
> do {
>
> Assert(cur_match < strstart, "no future");
>
> match = window + cur_match;
>
> +#ifdef __aarch64__
>
> + next_match = prev[cur_match & WMASK];
>
> + __asm__("PRFM PLDL1STRM, [%0]"::"r"(&(prev[next_match & WMASK])));
>
> +#endif
>
>
>
> /* Skip to next match if the match length cannot increase
>
> * or if the match length is less than 2:
>
> @@ -488,8 +495,14 @@ longest_match(IPos cur_match)
> scan_end = scan[best_len];
>
> #endif
>
> }
>
> - } while ((cur_match = prev[cur_match & WMASK]) > limit
>
> + }
>
> +#ifdef __aarch64__
>
> + while ((cur_match = next_match) > limit
>
> + && --chain_length != 0);
>
> +#else
>
> + while ((cur_match = prev[cur_match & WMASK]) > limit
>
> && --chain_length != 0);
>
> +#endif
>
>
>
> return best_len;
>
> }
>
> @@ -777,7 +790,22 @@ deflate (int pack_level)
> lookahead -= prev_length-1;
>
> prev_length -= 2;
>
> RSYNC_ROLL(strstart, prev_length+1);
>
> + while (prev_length >= 4) {
>
> + /* After actual verification, expanding this loop
>
> + * can improve its performance in certain scenarios.
>
> + */
>
> + prev_length -= 4;
>
> + strstart++;
>
> + INSERT_STRING(strstart, hash_head);
>
> + strstart++;
>
> + INSERT_STRING(strstart, hash_head);
>
> + strstart++;
>
> + INSERT_STRING(strstart, hash_head);
>
> + strstart++;
>
> + INSERT_STRING(strstart, hash_head);
>
> + }
>
> do {
>
> + if (prev_length == 0) break;
>
> strstart++;
>
> INSERT_STRING(strstart, hash_head);
>
> /* strstart never exceeds WSIZE-MAX_MATCH, so there are
>
> diff --git a/util.c b/util.c
> index 0a0fc21..c9f0e52 100644
> --- a/util.c
> +++ b/util.c
> @@ -38,6 +38,12 @@
>
>
> static int write_buffer (int, voidp, unsigned int);
>
>
>
> +#if defined __ARM_NEON__ || defined __ARM_NEON
>
> +#define CRC32D(crc, val) __asm__("crc32x %w[c], %w[c], %x[v]":[c]"+r"(crc):[v]"r"(val))
>
> +#define CRC32W(crc, val) __asm__("crc32w %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(val))
>
> +#define CRC32H(crc, val) __asm__("crc32h %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(val))
>
> +#define CRC32B(crc, val) __asm__("crc32b %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(val))
>
> +#else
>
> /* ========================================================================
>
> * Table of CRC-32's of all single-byte values (made by makecrc.c)
>
> */
>
> @@ -95,6 +101,7 @@ static const ulg crc_32_tab[] = {
> 0x5d681b02L, 0x2a6f2b94L, 0xb40bbe37L, 0xc30c8ea1L, 0x5a05df1bL,
>
> 0x2d02ef8dL
>
> };
>
> +#endif
>
>
>
> /* Shift register contents. */
>
> static ulg crc = 0xffffffffL;
>
> @@ -132,6 +139,43 @@ ulg updcrc(s, n)
> const uch *s; /* pointer to bytes to pump through */
>
> unsigned n; /* number of bytes in s[] */
>
> {
>
> +#if defined __ARM_NEON__ || defined __ARM_NEON
>
> + register ulg c;
>
> + static ulg crc = (ulg)0xffffffffL;
>
> + register const uint8_t *buf1;
>
> + register const uint16_t *buf2;
>
> + register const uint32_t *buf4;
>
> + register const uint64_t *buf8;
>
> + int64_t length = (int64_t)n;
>
> + buf8 = (const uint64_t *)(const void *)s;
>
> +
>
> + if (s == NULL) {
>
> + c = 0xffffffffL;
>
> + } else {
>
> + c = crc;
>
> + while(length >= sizeof(uint64_t)) {
>
> + CRC32D(c, *buf8++);
>
> + length -= sizeof(uint64_t);
>
> + }
>
> + buf4 = (const uint32_t *)(const void *)buf8;
>
> + if (length >= sizeof(uint32_t)) {
>
> + CRC32W(c, *buf4++);
>
> + length -= sizeof(uint32_t);
>
> + }
>
> + buf2 = (const uint16_t *)(const void *)buf4;
>
> + if(length >= sizeof(uint16_t)) {
>
> + CRC32H(c, *buf2++);
>
> + length -= sizeof(uint16_t);
>
> + }
>
> + buf1 = (const uint8_t *)(const void *)buf2;
>
> + if (length >= sizeof(uint8_t)) {
>
> + CRC32B(c, *buf1);
>
> + length -= sizeof(uint8_t);
>
> + }
>
> + }
>
> + crc = c;
>
> + return (c ^ 0xffffffffL);
>
> +#else
>
> register ulg c; /* temporary variable */
>
>
>
> if (s == NULL) {
>
> @@ -144,6 +188,7 @@ ulg updcrc(s, n)
> }
>
> crc = c;
>
> return c ^ 0xffffffffL; /* (instead of ~c for 64-bit machines) */
>
> +#endif
>
> }
>
>
>
> /* Return a current CRC value. */
>
Please allow me to show a set of actual test data for this patch.
First, I made an original version of the program "gzip-1.10" based
on the gzip-1.10 source code, and then made an optimized version of
the program "gzip-optimized" after applying my optimization patch.
Next I use gzip-1.10 version to test the compression and decompression
time on some **xml** files:
[XML]# time ./gzip-1.10 *.xml
real 0m5.099s
user 0m4.384s
sys 0m0.176s
[XML]# time ./gzip-1.10 -d *.gz
real 0m2.173s
user 0m1.821s
sys 0m0.348s
Then use the optimized version to compare:
[XML]# time ./gzip-optimized *.xml
real 0m2.785s
user 0m2.576s
sys 0m0.204s
[XML]# time ./gzip-optimized -d *.gz
real 0m0.497s
user 0m0.176s
sys 0m0.320s
The next test object is a large **log** file:
[LOG]# time ./gzip-1.10 *.log
real 0m8.883s
user 0m8.652s
sys 0m0.217s
[LOG]# time ./gzip-1.10 -d *.gz
real 0m3.049s
user 0m2.604s
sys 0m0.439s
Also use the optimized version to compare:
[LOG]# time ./gzip-optimized *.log
real 0m6.882s
user 0m6.607s
sys 0m0.264s
[LOG]# time ./gzip-optimized -d *.gz
real 0m1.054s
user 0m0.622s
sys 0m0.431s
The above experimental data are from the aarch64 platform.
--
Best regards,
Li Qiang
bug-gzip@HIDDEN:bug#41535; Package gzip.
Full text available.
Received: (at submit) by debbugs.gnu.org; 26 May 2020 05:17:59 +0000
From debbugs-submit-bounces <at> debbugs.gnu.org Tue May 26 01:17:59 2020
Received: from localhost ([127.0.0.1]:43369 helo=debbugs.gnu.org)
by debbugs.gnu.org with esmtp (Exim 4.84_2)
(envelope-from <debbugs-submit-bounces <at> debbugs.gnu.org>)
id 1jdRyQ-0002db-Be
for submit <at> debbugs.gnu.org; Tue, 26 May 2020 01:17:58 -0400
Received: from lists.gnu.org ([209.51.188.17]:56748)
by debbugs.gnu.org with esmtp (Exim 4.84_2)
(envelope-from <liqiang64@HIDDEN>) id 1jdPVi-0002uY-He
for submit <at> debbugs.gnu.org; Mon, 25 May 2020 22:40:11 -0400
Received: from eggs.gnu.org ([2001:470:142:3::10]:41510)
by lists.gnu.org with esmtps (TLS1.2:ECDHE_RSA_AES_256_GCM_SHA384:256)
(Exim 4.90_1) (envelope-from <liqiang64@HIDDEN>)
id 1jdPVi-0005rV-BR
for bug-gzip@HIDDEN; Mon, 25 May 2020 22:40:10 -0400
Received: from szxga06-in.huawei.com ([45.249.212.32]:33558 helo=huawei.com)
by eggs.gnu.org with esmtps (TLS1.2:ECDHE_RSA_AES_256_GCM_SHA384:256)
(Exim 4.90_1) (envelope-from <liqiang64@HIDDEN>)
id 1jdPVg-0000f4-DY
for bug-gzip@HIDDEN; Mon, 25 May 2020 22:40:09 -0400
Received: from DGGEMS402-HUB.china.huawei.com (unknown [172.30.72.60])
by Forcepoint Email with ESMTP id 8CC6788B7A8D9320AB1D;
Tue, 26 May 2020 10:39:52 +0800 (CST)
Received: from huawei.com (10.108.222.92) by DGGEMS402-HUB.china.huawei.com
(10.3.19.202) with Microsoft SMTP Server id 14.3.487.0; Tue, 26 May 2020
10:39:42 +0800
From: l00374334 <liqiang64@HIDDEN>
To: <bug-gzip@HIDDEN>, <eggert@HIDDEN>
Subject: [PATCH] performance optimization for aarch64
Date: Tue, 26 May 2020 10:39:40 +0800
Message-ID: <20200526023940.1967-1-liqiang64@HIDDEN>
X-Mailer: git-send-email 2.23.0.windows.1
MIME-Version: 1.0
Content-Transfer-Encoding: quoted-printable
Content-Type: text/plain
X-Originating-IP: [10.108.222.92]
X-CFilter-Loop: Reflected
Received-SPF: pass client-ip=45.249.212.32; envelope-from=liqiang64@HIDDEN;
helo=huawei.com
X-detected-operating-system: by eggs.gnu.org: First seen = 2020/05/25 22:39:53
X-ACL-Warn: Detected OS = Linux 3.11 and newer [fuzzy]
X-Spam_score_int: -41
X-Spam_score: -4.2
X-Spam_bar: ----
X-Spam_report: (-4.2 / 5.0 requ) BAYES_00=-1.9, RCVD_IN_DNSWL_MED=-2.3,
RCVD_IN_MSPIKE_H4=0.001, RCVD_IN_MSPIKE_WL=0.001, SPF_HELO_PASS=-0.001,
SPF_PASS=-0.001, URIBL_BLOCKED=0.001 autolearn=_AUTOLEARN
X-Spam_action: no action
X-Spam-Score: -1.4 (-)
X-Debbugs-Envelope-To: submit
X-Mailman-Approved-At: Tue, 26 May 2020 01:17:56 -0400
Cc: luanjianhai@HIDDEN, liqiang64@HIDDEN, sangyan@HIDDEN,
luchunhua@HIDDEN
X-BeenThere: debbugs-submit <at> debbugs.gnu.org
X-Mailman-Version: 2.1.18
Precedence: list
List-Id: <debbugs-submit.debbugs.gnu.org>
List-Unsubscribe: <https://debbugs.gnu.org/cgi-bin/mailman/options/debbugs-submit>,
<mailto:debbugs-submit-request <at> debbugs.gnu.org?subject=unsubscribe>
List-Archive: <https://debbugs.gnu.org/cgi-bin/mailman/private/debbugs-submit/>
List-Post: <mailto:debbugs-submit <at> debbugs.gnu.org>
List-Help: <mailto:debbugs-submit-request <at> debbugs.gnu.org?subject=help>
List-Subscribe: <https://debbugs.gnu.org/cgi-bin/mailman/listinfo/debbugs-submit>,
<mailto:debbugs-submit-request <at> debbugs.gnu.org?subject=subscribe>
Errors-To: debbugs-submit-bounces <at> debbugs.gnu.org
Sender: "Debbugs-submit" <debbugs-submit-bounces <at> debbugs.gnu.org>
X-Spam-Score: -2.4 (--)
From: liqiang <liqiang64@HIDDEN>
By analyzing the compression and decompression process of gzip, I found =0D
that the hot spots of CRC32 and longest_match function are very high.=0D
=0D
On the aarch64 architecture, we can optimize the efficiency of crc32 =0D
through the interface provided by the neon instruction set (12x faster =0D
in aarch64), and optimize the performance of random access code through =0D
prefetch instructions (about 5%~8% improvement). In some compression =0D
scenarios, loop expansion can also get a certain performance improvement =0D
(about 10%).=0D
=0D
Modify by Li Qiang.
---
configure | 14 ++++++++++++++
deflate.c | 30 +++++++++++++++++++++++++++++-
util.c | 45 +++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 88 insertions(+), 1 deletion(-)
diff --git a/configure b/configure
index cab3daf..dc80cb6 100644
--- a/configure
+++ b/configure
@@ -14555,6 +14555,20 @@ rm -f core conftest.err conftest.$ac_objext confte=
st.$ac_ext
;;
=20
arm* | aarch64 )
+ cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h. */
+#if defined __ARM_NEON__ || defined __ARM_NEON
+ int ok;
+ #else
+ error fail
+ #endif
+
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"
+then :
+ CFLAGS=3D"$CFLAGS -march=3Darmv8-a+crc"
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
# Assume arm with EABI.
# On arm64 systems, the C compiler may be generating code in on=
e of
# these ABIs:
diff --git a/deflate.c b/deflate.c
index 9d379e9..ee77ffd 100644
--- a/deflate.c
+++ b/deflate.c
@@ -378,6 +378,9 @@ longest_match(IPos cur_match)
register int len; /* length of current match=
*/=0D
int best_len =3D prev_length; /* best match length so =
far */=0D
IPos limit =3D strstart > (IPos)MAX_DIST ? strstart - (IPos)MAX_DIST :=
NIL;=0D
+#ifdef __aarch64__=0D
+ IPos next_match;=0D
+#endif=0D
/* Stop when cur_match becomes <=3D limit. To simplify the code,=0D
* we prevent matches with the string of window index 0.=0D
*/=0D
@@ -411,6 +414,10 @@ longest_match(IPos cur_match)
do {=0D
Assert(cur_match < strstart, "no future");=0D
match =3D window + cur_match;=0D
+#ifdef __aarch64__=0D
+ next_match =3D prev[cur_match & WMASK];=0D
+ __asm__("PRFM PLDL1STRM, [%0]"::"r"(&(prev[next_match & WMASK])));=
=0D
+#endif=0D
=0D
/* Skip to next match if the match length cannot increase=0D
* or if the match length is less than 2:=0D
@@ -488,8 +495,14 @@ longest_match(IPos cur_match)
scan_end =3D scan[best_len];=0D
#endif=0D
}=0D
- } while ((cur_match =3D prev[cur_match & WMASK]) > limit=0D
+ }=0D
+#ifdef __aarch64__=0D
+ while ((cur_match =3D next_match) > limit=0D
+ && --chain_length !=3D 0);=0D
+#else=0D
+ while ((cur_match =3D prev[cur_match & WMASK]) > limit=0D
&& --chain_length !=3D 0);=0D
+#endif=0D
=0D
return best_len;=0D
}=0D
@@ -777,7 +790,22 @@ deflate (int pack_level)
lookahead -=3D prev_length-1;=0D
prev_length -=3D 2;=0D
RSYNC_ROLL(strstart, prev_length+1);=0D
+ while (prev_length >=3D 4) {=0D
+ /* After actual verification, expanding this loop=0D
+ * can improve its performance in certain scenarios.=0D
+ */=0D
+ prev_length -=3D 4;=0D
+ strstart++;=0D
+ INSERT_STRING(strstart, hash_head);=0D
+ strstart++;=0D
+ INSERT_STRING(strstart, hash_head);=0D
+ strstart++;=0D
+ INSERT_STRING(strstart, hash_head);=0D
+ strstart++;=0D
+ INSERT_STRING(strstart, hash_head);=0D
+ }=0D
do {=0D
+ if (prev_length =3D=3D 0) break;=0D
strstart++;=0D
INSERT_STRING(strstart, hash_head);=0D
/* strstart never exceeds WSIZE-MAX_MATCH, so there are=0D
diff --git a/util.c b/util.c
index 0a0fc21..c9f0e52 100644
--- a/util.c
+++ b/util.c
@@ -38,6 +38,12 @@
=0D
static int write_buffer (int, voidp, unsigned int);=0D
=0D
+#if defined __ARM_NEON__ || defined __ARM_NEON=0D
+#define CRC32D(crc, val) __asm__("crc32x %w[c], %w[c], %x[v]":[c]"+r"(crc)=
:[v]"r"(val))=0D
+#define CRC32W(crc, val) __asm__("crc32w %w[c], %w[c], %w[v]":[c]"+r"(crc)=
:[v]"r"(val))=0D
+#define CRC32H(crc, val) __asm__("crc32h %w[c], %w[c], %w[v]":[c]"+r"(crc)=
:[v]"r"(val))=0D
+#define CRC32B(crc, val) __asm__("crc32b %w[c], %w[c], %w[v]":[c]"+r"(crc)=
:[v]"r"(val))=0D
+#else=0D
/* =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=0D
* Table of CRC-32's of all single-byte values (made by makecrc.c)=0D
*/=0D
@@ -95,6 +101,7 @@ static const ulg crc_32_tab[] =3D {
0x5d681b02L, 0x2a6f2b94L, 0xb40bbe37L, 0xc30c8ea1L, 0x5a05df1bL,=0D
0x2d02ef8dL=0D
};=0D
+#endif=0D
=0D
/* Shift register contents. */=0D
static ulg crc =3D 0xffffffffL;=0D
@@ -132,6 +139,43 @@ ulg updcrc(s, n)
const uch *s; /* pointer to bytes to pump through */=0D
unsigned n; /* number of bytes in s[] */=0D
{=0D
+#if defined __ARM_NEON__ || defined __ARM_NEON=0D
+ register ulg c;=0D
+ static ulg crc =3D (ulg)0xffffffffL;=0D
+ register const uint8_t *buf1;=0D
+ register const uint16_t *buf2;=0D
+ register const uint32_t *buf4;=0D
+ register const uint64_t *buf8;=0D
+ int64_t length =3D (int64_t)n;=0D
+ buf8 =3D (const uint64_t *)(const void *)s;=0D
+=0D
+ if (s =3D=3D NULL) {=0D
+ c =3D 0xffffffffL;=0D
+ } else {=0D
+ c =3D crc;=0D
+ while(length >=3D sizeof(uint64_t)) {=0D
+ CRC32D(c, *buf8++);=0D
+ length -=3D sizeof(uint64_t);=0D
+ }=0D
+ buf4 =3D (const uint32_t *)(const void *)buf8;=0D
+ if (length >=3D sizeof(uint32_t)) {=0D
+ CRC32W(c, *buf4++);=0D
+ length -=3D sizeof(uint32_t);=0D
+ }=0D
+ buf2 =3D (const uint16_t *)(const void *)buf4;=0D
+ if(length >=3D sizeof(uint16_t)) {=0D
+ CRC32H(c, *buf2++);=0D
+ length -=3D sizeof(uint16_t);=0D
+ }=0D
+ buf1 =3D (const uint8_t *)(const void *)buf2;=0D
+ if (length >=3D sizeof(uint8_t)) {=0D
+ CRC32B(c, *buf1);=0D
+ length -=3D sizeof(uint8_t);=0D
+ }=0D
+ }=0D
+ crc =3D c;=0D
+ return (c ^ 0xffffffffL);=0D
+#else=0D
register ulg c; /* temporary variable */=0D
=0D
if (s =3D=3D NULL) {=0D
@@ -144,6 +188,7 @@ ulg updcrc(s, n)
}=0D
crc =3D c;=0D
return c ^ 0xffffffffL; /* (instead of ~c for 64-bit machines) *=
/=0D
+#endif=0D
}=0D
=0D
/* Return a current CRC value. */=0D
--=20
2.17.1
l00374334 <liqiang64@HIDDEN>:bug-gzip@HIDDEN.
Full text available.
bug-gzip@HIDDEN:bug#41535; Package gzip.
Full text available.
GNU bug tracking system
Copyright (C) 1999 Darren O. Benham,
1997 nCipher Corporation Ltd,
1994-97 Ian Jackson.