26
26
* SUCH DAMAGE.
27
27
*/
28
28
/*
29
- * ASCII magic -- file types that we know based on keywords
30
- * that can appear anywhere in the file.
29
+ * ASCII magic -- try to detect text encoding.
31
30
*
32
31
* Extensively modified by Eric Fischer <enf@pobox.com> in July, 2000,
33
32
* to handle character codes other than ASCII on a unified basis.
36
35
#include "file.h"
37
36
38
37
#ifndef lint
39
- FILE_RCSID ("@(#)$File: ascmagic.c,v 1.75 2009/02/03 20:27:51 christos Exp $" )
38
+ FILE_RCSID ("@(#)$File: ascmagic.c,v 1.84 2011/12/08 12:38:24 rrt Exp $" )
40
39
#endif /* lint */
41
40
42
41
#include "magic.h"
@@ -47,13 +46,11 @@ FILE_RCSID("@(#)$File: ascmagic.c,v 1.75 2009/02/03 20:27:51 christos Exp $")
47
46
#ifdef HAVE_UNISTD_H
48
47
#include <unistd.h>
49
48
#endif
50
- #include "names.h"
51
49
52
50
#define MAXLINELEN 300 /* longest sane line length */
53
51
#define ISSPC (x ) ((x) == ' ' || (x) == '\t' || (x) == '\r' || (x) == '\n' \
54
52
|| (x) == 0x85 || (x) == '\f')
55
53
56
- private int ascmatch (const unsigned char * , const unichar * , size_t );
57
54
private unsigned char * encode_utf8 (unsigned char * , size_t , unichar * , size_t );
58
55
private size_t trim_nuls (const unsigned char * , size_t );
59
56
@@ -71,7 +68,8 @@ trim_nuls(const unsigned char *buf, size_t nbytes)
71
68
}
72
69
73
70
protected int
74
- file_ascmagic (struct magic_set * ms , const unsigned char * buf , size_t nbytes )
71
+ file_ascmagic (struct magic_set * ms , const unsigned char * buf , size_t nbytes ,
72
+ int text )
75
73
{
76
74
unichar * ubuf = NULL ;
77
75
size_t ulen ;
@@ -88,29 +86,24 @@ file_ascmagic(struct magic_set *ms, const unsigned char *buf, size_t nbytes)
88
86
89
87
/* If file doesn't look like any sort of text, give up. */
90
88
if (file_encoding (ms , buf , nbytes , & ubuf , & ulen , & code , & code_mime ,
91
- & type ) == 0 ) {
89
+ & type ) == 0 )
92
90
rv = 0 ;
93
- goto done ;
94
- }
91
+ else
92
+ rv = file_ascmagic_with_encoding (ms , buf , nbytes , ubuf , ulen , code ,
93
+ type , text );
95
94
96
- rv = file_ascmagic_with_encoding (ms , buf , nbytes , ubuf , ulen , code ,
97
- type );
98
-
99
- done :
100
- if (ubuf )
101
- free (ubuf );
95
+ free (ubuf );
102
96
103
97
return rv ;
104
98
}
105
99
106
100
protected int
107
101
file_ascmagic_with_encoding (struct magic_set * ms , const unsigned char * buf ,
108
102
size_t nbytes , unichar * ubuf , size_t ulen , const char * code ,
109
- const char * type )
103
+ const char * type , int text )
110
104
{
111
105
unsigned char * utf8_buf = NULL , * utf8_end ;
112
106
size_t mlen , i ;
113
- const struct names * p ;
114
107
int rv = -1 ;
115
108
int mime = ms -> flags & MAGIC_MIME ;
116
109
@@ -125,6 +118,7 @@ file_ascmagic_with_encoding(struct magic_set *ms, const unsigned char *buf,
125
118
int n_lf = 0 ;
126
119
int n_cr = 0 ;
127
120
int n_nel = 0 ;
121
+ int executable = 0 ;
128
122
129
123
size_t last_line_end = (size_t )-1 ;
130
124
int has_long_lines = 0 ;
@@ -140,54 +134,21 @@ file_ascmagic_with_encoding(struct magic_set *ms, const unsigned char *buf,
140
134
goto done ;
141
135
}
142
136
143
- /* Convert ubuf to UTF-8 and try text soft magic */
144
- /* malloc size is a conservative overestimate; could be
145
- improved, or at least realloced after conversion. */
146
- mlen = ulen * 6 ;
147
- utf8_buf = emalloc (mlen );
148
-
149
- if ((utf8_end = encode_utf8 (utf8_buf , mlen , ubuf , ulen )) == NULL )
150
- goto done ;
151
- if ((rv = file_softmagic (ms , utf8_buf , (size_t )(utf8_end - utf8_buf ),
152
- TEXTTEST )) != 0 )
153
- goto done ;
154
- else
155
- rv = -1 ;
156
-
157
- /* look for tokens from names.h - this is expensive! */
158
- if ((ms -> flags & MAGIC_NO_CHECK_TOKENS ) != 0 )
159
- goto subtype_identified ;
160
-
161
- i = 0 ;
162
- while (i < ulen ) {
163
- size_t end ;
164
-
165
- /* skip past any leading space */
166
- while (i < ulen && ISSPC (ubuf [i ]))
167
- i ++ ;
168
- if (i >= ulen )
169
- break ;
170
-
171
- /* find the next whitespace */
172
- for (end = i + 1 ; end < nbytes ; end ++ )
173
- if (ISSPC (ubuf [end ]))
174
- break ;
175
-
176
- /* compare the word thus isolated against the token list */
177
- for (p = names ; p < names + NNAMES ; p ++ ) {
178
- if (ascmatch ((const unsigned char * )p -> name , ubuf + i ,
179
- end - i )) {
180
- subtype = types [p -> type ].human ;
181
- subtype_mime = types [p -> type ].mime ;
182
- goto subtype_identified ;
183
- }
184
- }
137
+ if ((ms -> flags & MAGIC_NO_CHECK_SOFT ) == 0 ) {
138
+ /* Convert ubuf to UTF-8 and try text soft magic */
139
+ /* malloc size is a conservative overestimate; could be
140
+ improved, or at least realloced after conversion. */
141
+ mlen = ulen * 6 ;
142
+ utf8_buf = emalloc (mlen );
185
143
186
- i = end ;
144
+ if ((utf8_end = encode_utf8 (utf8_buf , mlen , ubuf , ulen ))
145
+ == NULL )
146
+ goto done ;
147
+ if ((rv = file_softmagic (ms , utf8_buf ,
148
+ (size_t )(utf8_end - utf8_buf ), TEXTTEST , text )) == 0 )
149
+ rv = -1 ;
187
150
}
188
151
189
- subtype_identified :
190
-
191
152
/* Now try to discover other details about the file. */
192
153
for (i = 0 ; i < ulen ; i ++ ) {
193
154
if (ubuf [i ] == '\n' ) {
@@ -230,7 +191,7 @@ file_ascmagic_with_encoding(struct magic_set *ms, const unsigned char *buf,
230
191
goto done ;
231
192
}
232
193
if (mime ) {
233
- if ((mime & MAGIC_MIME_TYPE ) != 0 ) {
194
+ if (! file_printedlen ( ms ) && (mime & MAGIC_MIME_TYPE ) != 0 ) {
234
195
if (subtype_mime ) {
235
196
if (file_printf (ms , "%s" , subtype_mime ) == -1 )
236
197
goto done ;
@@ -240,6 +201,28 @@ file_ascmagic_with_encoding(struct magic_set *ms, const unsigned char *buf,
240
201
}
241
202
}
242
203
} else {
204
+ if (file_printedlen (ms )) {
205
+ switch (file_replace (ms , " text$" , ", " )) {
206
+ case 0 :
207
+ switch (file_replace (ms , " text executable$" ,
208
+ ", " )) {
209
+ case 0 :
210
+ if (file_printf (ms , ", " ) == -1 )
211
+ goto done ;
212
+ case -1 :
213
+ goto done ;
214
+ default :
215
+ executable = 1 ;
216
+ break ;
217
+ }
218
+ break ;
219
+ case -1 :
220
+ goto done ;
221
+ default :
222
+ break ;
223
+ }
224
+ }
225
+
243
226
if (file_printf (ms , "%s" , code ) == -1 )
244
227
goto done ;
245
228
@@ -251,6 +234,10 @@ file_ascmagic_with_encoding(struct magic_set *ms, const unsigned char *buf,
251
234
if (file_printf (ms , " %s" , type ) == -1 )
252
235
goto done ;
253
236
237
+ if (executable )
238
+ if (file_printf (ms , " executable" ) == -1 )
239
+ goto done ;
240
+
254
241
if (has_long_lines )
255
242
if (file_printf (ms , ", with very long lines" ) == -1 )
256
243
goto done ;
@@ -313,22 +300,6 @@ file_ascmagic_with_encoding(struct magic_set *ms, const unsigned char *buf,
313
300
return rv ;
314
301
}
315
302
316
- private int
317
- ascmatch (const unsigned char * s , const unichar * us , size_t ulen )
318
- {
319
- size_t i ;
320
-
321
- for (i = 0 ; i < ulen ; i ++ ) {
322
- if (s [i ] != us [i ])
323
- return 0 ;
324
- }
325
-
326
- if (s [i ])
327
- return 0 ;
328
- else
329
- return 1 ;
330
- }
331
-
332
303
/*
333
304
* Encode Unicode string as UTF-8, returning pointer to character
334
305
* after end of string, or NULL if an invalid character is found.
0 commit comments