| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| #include <config.h> |
|
|
| #include "cksum.h" |
|
|
| #include <stdio.h> |
| #include <sys/types.h> |
| #include <arm_neon.h> |
| #include "system.h" |
|
|
| |
/* Number of bytes requested from the stream per fread call; also the
   size in bytes of the on-stack vector buffer (64 KiB).  It is a
   multiple of 16, so the buffer holds a whole number of 128-bit
   vectors.  */
#define BUFLEN (64 * 1024)
|
|
| static uint64x2_t |
| bswap_neon (uint64x2_t in) |
| { |
| uint64x2_t a = |
| vreinterpretq_u64_u8 (vrev64q_u8 (vreinterpretq_u8_u64 (in))); |
| a = vcombine_u64 (vget_high_u64 (a), vget_low_u64 (a)); |
| return a; |
| } |
|
|
| |
|
|
| bool |
| cksum_vmull (FILE *fp, uint_fast32_t *crc_out, uintmax_t *length_out) |
| { |
| uint64x2_t buf[BUFLEN / sizeof (uint64x2_t)]; |
| uint_fast32_t crc = 0; |
| uintmax_t length = 0; |
| size_t bytes_read; |
| poly64x2_t single_mult_constant; |
| poly64x2_t four_mult_constant; |
|
|
| if (!fp || !crc_out || !length_out) |
| return false; |
|
|
| |
| |
| |
| single_mult_constant = |
| vcombine_p64 (vcreate_p64 (0xE8A45605), vcreate_p64 (0xC5B9CD4C)); |
| four_mult_constant = |
| vcombine_p64 (vcreate_p64 (0xE6228B11), vcreate_p64 (0x8833794C)); |
|
|
| while ((bytes_read = fread (buf, 1, BUFLEN, fp)) > 0) |
| { |
| uint64x2_t *datap; |
| uint64x2_t data; |
| uint64x2_t data2; |
| uint64x2_t data3; |
| uint64x2_t data4; |
| uint64x2_t data5; |
| uint64x2_t data6; |
| uint64x2_t data7; |
| uint64x2_t data8; |
| uint64x2_t fold_data; |
| uint64x2_t xor_crc; |
|
|
| if (length + bytes_read < length) |
| { |
| errno = EOVERFLOW; |
| return false; |
| } |
| length += bytes_read; |
|
|
| datap = (uint64x2_t *) buf; |
|
|
| |
| if (bytes_read >= 16 * 8) |
| { |
| data = vld1q_u64 ((uint64_t *) (datap)); |
| data = bswap_neon (data); |
| |
| |
| uint64_t wcrc = crc; |
| xor_crc = vcombine_u64 (vcreate_u64 (0), vcreate_u64 (wcrc << 32)); |
| crc = 0; |
| data = veorq_u64 (data, xor_crc); |
| data3 = vld1q_u64 ((uint64_t *) (datap + 1)); |
| data3 = bswap_neon (data3); |
| data5 = vld1q_u64 ((uint64_t *) (datap + 2)); |
| data5 = bswap_neon (data5); |
| data7 = vld1q_u64 ((uint64_t *) (datap + 3)); |
| data7 = bswap_neon (data7); |
|
|
|
|
| while (bytes_read >= 16 * 8) |
| { |
| datap += 4; |
|
|
| |
| data2 = |
| vreinterpretq_u64_p128 (vmull_p64 |
| (vgetq_lane_p64 |
| (vreinterpretq_p64_u64 (data), 0), |
| vgetq_lane_p64 (four_mult_constant, |
| 0))); |
| data = |
| vreinterpretq_u64_p128 (vmull_high_p64 |
| (vreinterpretq_p64_u64 (data), |
| four_mult_constant)); |
| data4 = |
| vreinterpretq_u64_p128 (vmull_p64 |
| (vgetq_lane_p64 |
| (vreinterpretq_p64_u64 (data3), 0), |
| vgetq_lane_p64 (four_mult_constant, |
| 0))); |
| data3 = |
| vreinterpretq_u64_p128 (vmull_high_p64 |
| (vreinterpretq_p64_u64 (data3), |
| four_mult_constant)); |
| data6 = |
| vreinterpretq_u64_p128 (vmull_p64 |
| (vgetq_lane_p64 |
| (vreinterpretq_p64_u64 (data5), 0), |
| vgetq_lane_p64 (four_mult_constant, |
| 0))); |
| data5 = |
| vreinterpretq_u64_p128 (vmull_high_p64 |
| (vreinterpretq_p64_u64 (data5), |
| four_mult_constant)); |
| data8 = |
| vreinterpretq_u64_p128 (vmull_p64 |
| (vgetq_lane_p64 |
| (vreinterpretq_p64_u64 (data7), 0), |
| vgetq_lane_p64 (four_mult_constant, |
| 0))); |
| data7 = |
| vreinterpretq_u64_p128 (vmull_high_p64 |
| (vreinterpretq_p64_u64 (data7), |
| four_mult_constant)); |
|
|
| |
| |
| |
| |
| |
| data = veorq_u64 (data, data2); |
| data2 = vld1q_u64 ((uint64_t *) (datap)); |
| data2 = bswap_neon (data2); |
| data = veorq_u64 (data, data2); |
|
|
| data3 = veorq_u64 (data3, data4); |
| data4 = vld1q_u64 ((uint64_t *) (datap + 1)); |
| data4 = bswap_neon (data4); |
| data3 = veorq_u64 (data3, data4); |
|
|
| data5 = veorq_u64 (data5, data6); |
| data6 = vld1q_u64 ((uint64_t *) (datap + 2)); |
| data6 = bswap_neon (data6); |
| data5 = veorq_u64 (data5, data6); |
|
|
| data7 = veorq_u64 (data7, data8); |
| data8 = vld1q_u64 ((uint64_t *) (datap + 3)); |
| data8 = bswap_neon (data8); |
| data7 = veorq_u64 (data7, data8); |
|
|
| bytes_read -= (16 * 4); |
| } |
| |
| |
| data = bswap_neon (data); |
| vst1q_u64 ((uint64_t *) (datap), data); |
| data3 = bswap_neon (data3); |
| vst1q_u64 ((uint64_t *) (datap + 1), data3); |
| data5 = bswap_neon (data5); |
| vst1q_u64 ((uint64_t *) (datap + 2), data5); |
| data7 = bswap_neon (data7); |
| vst1q_u64 ((uint64_t *) (datap + 3), data7); |
| } |
|
|
| |
| if (bytes_read >= 32) |
| { |
| data = vld1q_u64 ((uint64_t *) (datap)); |
| data = bswap_neon (data); |
| uint64_t wcrc = crc; |
| xor_crc = vcombine_u64 (vcreate_u64 (0), vcreate_u64 (wcrc << 32)); |
| crc = 0; |
| data = veorq_u64 (data, xor_crc); |
| while (bytes_read >= 32) |
| { |
| datap++; |
|
|
| data2 = |
| vreinterpretq_u64_p128 (vmull_p64 |
| (vgetq_lane_p64 |
| (vreinterpretq_p64_u64 (data), 0), |
| vgetq_lane_p64 (single_mult_constant, |
| 0))); |
| data = |
| vreinterpretq_u64_p128 (vmull_high_p64 |
| (vreinterpretq_p64_u64 (data), |
| single_mult_constant)); |
| fold_data = vld1q_u64 ((uint64_t *) (datap)); |
| fold_data = bswap_neon (fold_data); |
| data = veorq_u64 (data, data2); |
| data = veorq_u64 (data, fold_data); |
| bytes_read -= 16; |
| } |
| data = bswap_neon (data); |
| vst1q_u64 ((uint64_t *) (datap), data); |
| } |
|
|
| |
| unsigned char *cp = (unsigned char *) datap; |
| while (bytes_read--) |
| crc = (crc << 8) ^ crctab[0][((crc >> 24) ^ *cp++) & 0xFF]; |
| if (feof (fp)) |
| break; |
| } |
|
|
| *crc_out = crc; |
| *length_out = length; |
|
|
| return !ferror (fp); |
| } |
|
|