# -*- coding: utf-8 -*-
"""
    pygments.lexers.textfmts
    ~~~~~~~~~~~~~~~~~~~~~~~~

    Lexers for various text formats.

    :copyright: Copyright 2006-2017 by the Pygments team, see AUTHORS.
    :license: BSD, see LICENSE for details.
"""

import re

from pygments.lexer import RegexLexer, bygroups
from pygments.token import Text, Comment, Operator, Keyword, Name, String, \
    Number, Generic, Literal
from pygments.util import ClassNotFound

__all__ = ['IrcLogsLexer', 'TodotxtLexer', 'HttpLexer', 'GettextLexer']

class IrcLogsLexer(RegexLexer):
    """
    Lexer for IRC logs in *irssi*, *xchat* or *weechat* style.
    """

    name = 'IRC logs'
    aliases = ['irc']
    filenames = ['*.weechatlog']
    mimetypes = ['text/x-irclog']

    flags = re.VERBOSE | re.MULTILINE
    timestamp = r"""
        (
          # irssi / xchat and others
          (?: \[|\()?                  # Opening bracket or paren for the timestamp
            (?:                        # Timestamp
                (?: (?:\d{1,4} [-/])*  # Date as - or /-separated groups of digits
                    (?:\d{1,4})
                 [T ])?                # Date/time separator: T or space
                (?: \d?\d [:.])*       # Time as :/.-separated groups of 1 or 2 digits
                    (?: \d?\d)
            )
          (?: \]|\))?\s+               # Closing bracket or paren for the timestamp
        |
          # weechat
          \d{4}\s\w{3}\s\d{2}\s        # Date
          \d{2}:\d{2}:\d{2}\s+         # Time + Whitespace
        |
          # xchat
          \w{3}\s\d{2}\s               # Date
          \d{2}:\d{2}:\d{2}\s+         # Time + Whitespace
        )?
    """
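    # The optional timestamp branch accepts forms such as '[12:34] ',
    # '12:34:56 ', or '(2017-01-01 12:34) ' (irssi/xchat style),
    # '2017 Jan 01 12:34:56 ' (weechat), and 'Jan 01 12:34:56 ' (xchat).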
    tokens = {
        'root': [
            # log start/end
            (r'^\*\*\*\*(.*)\*\*\*\*$', Comment),
            # hack: a line containing nothing but a timestamp and a nick
            ("^" + timestamp + r'(\s*<[^>]*>\s*)$', bygroups(Comment.Preproc, Name.Tag)),
            # normal msgs
            ("^" + timestamp + r"""
                (\s*<.*?>\s*)          # Nick """,
             bygroups(Comment.Preproc, Name.Tag), 'msg'),
            # /me msgs
            ("^" + timestamp + r"""
                (\s*[*]\s+)            # Star
                (\S+\s+.*?\n)          # Nick + rest of message """,
             bygroups(Comment.Preproc, Keyword, Generic.Inserted)),
            # join/part msgs
            ("^" + timestamp + r"""
                (\s*(?:\*{3}|<?-[!@=P]?->?)\s*)  # Star(s) or symbols
                (\S+\s+)                         # Nick + Space
                (.*?\n)                          # Rest of message """,
             bygroups(Comment.Preproc, Keyword, String, Comment)),
            (r"^.*?\n", Text),
        ],
        'msg': [
            (r"\S+:(?!//)", Name.Attribute),  # Prefix
            (r".*\n", Text, '#pop'),
        ],
    }
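
# A minimal usage sketch (illustrative; the sample line is invented):
#
#     from pygments.lexers.textfmts import IrcLogsLexer
#
#     line = '[12:34] <alice> hello, world\n'
#     for index, token, value in IrcLogsLexer().get_tokens_unprocessed(line):
#         print(index, token, repr(value))
#
# The timestamp should lex as Comment.Preproc and '<alice>' as Name.Tag;
# the message body is then handled by the 'msg' state.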


class GettextLexer(RegexLexer):
    """
    Lexer for Gettext catalog files.

    .. versionadded:: 0.9
    """
    name = 'Gettext Catalog'
    aliases = ['pot', 'po']
    filenames = ['*.pot', '*.po']
    mimetypes = ['application/x-gettext', 'text/x-gettext', 'text/gettext']

    tokens = {
        'root': [
            (r'^#,\s.*?$', Keyword.Type),
            (r'^#:\s.*?$', Keyword.Declaration),
            # (r'^#$', Comment),
            (r'^(#|#\.\s|#\|\s|#~\s|#\s).*$', Comment.Single),
            (r'^(")([A-Za-z-]+:)(.*")$',
             bygroups(String, Name.Property, String)),
            (r'^".*"$', String),
            (r'^(msgid|msgid_plural|msgstr|msgctxt)(\s+)(".*")$',
             bygroups(Name.Variable, Text, String)),
            (r'^(msgstr\[)(\d)(\])(\s+)(".*")$',
             bygroups(Name.Variable, Number.Integer, Name.Variable, Text, String)),
        ]
    }
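
# A minimal usage sketch (illustrative; the catalog snippet is invented):
#
#     from pygments.lexers.textfmts import GettextLexer
#
#     sample = 'msgid "Hello"\nmsgstr "Bonjour"\n'
#     for index, token, value in GettextLexer().get_tokens_unprocessed(sample):
#         print(index, token, repr(value))
#
# 'msgid' and 'msgstr' should lex as Name.Variable and the quoted
# strings as String.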


class HttpLexer(RegexLexer):
    """
    Lexer for HTTP sessions.

    .. versionadded:: 1.5
    """

    name = 'HTTP'
    aliases = ['http']

    flags = re.DOTALL

    def get_tokens_unprocessed(self, text, stack=('root',)):
        """Reset the content-type state."""
        self.content_type = None
        return RegexLexer.get_tokens_unprocessed(self, text, stack)

    def header_callback(self, match):
        if match.group(1).lower() == 'content-type':
            content_type = match.group(5).strip()
            if ';' in content_type:
                content_type = content_type[:content_type.find(';')].strip()
            self.content_type = content_type
        yield match.start(1), Name.Attribute, match.group(1)
        yield match.start(2), Text, match.group(2)
        yield match.start(3), Operator, match.group(3)
        yield match.start(4), Text, match.group(4)
        yield match.start(5), Literal, match.group(5)
        yield match.start(6), Text, match.group(6)

    def continuous_header_callback(self, match):
        yield match.start(1), Text, match.group(1)
        yield match.start(2), Literal, match.group(2)
        yield match.start(3), Text, match.group(3)

    def content_callback(self, match):
        content_type = getattr(self, 'content_type', None)
        content = match.group()
        offset = match.start()
        if content_type:
            from pygments.lexers import get_lexer_for_mimetype
            possible_lexer_mimetypes = [content_type]
            if '+' in content_type:
                # application/calendar+xml can be treated as application/xml
                # if there's not a better match.
                general_type = re.sub(r'^(.*)/.*\+(.*)$', r'\1/\2',
                                      content_type)
                possible_lexer_mimetypes.append(general_type)

            for i in possible_lexer_mimetypes:
                try:
                    lexer = get_lexer_for_mimetype(i)
                except ClassNotFound:
                    pass
                else:
                    for idx, token, value in lexer.get_tokens_unprocessed(content):
                        yield offset + idx, token, value
                    return
        yield offset, Text, content

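    # The 'root' rules below recognize an HTTP/1.x request line such as
    # 'GET /index.html HTTP/1.1' or a status line such as 'HTTP/1.1 200 OK',
    # then shift to the 'headers' state; the blank line ending the headers
    # switches to 'content', which is dispatched through content_callback.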
    tokens = {
        'root': [
            (r'(GET|POST|PUT|DELETE|HEAD|OPTIONS|TRACE|PATCH)( +)([^ ]+)( +)'
             r'(HTTP)(/)(1\.[01])(\r?\n|\Z)',
             bygroups(Name.Function, Text, Name.Namespace, Text,
                      Keyword.Reserved, Operator, Number, Text),
             'headers'),
            (r'(HTTP)(/)(1\.[01])( +)(\d{3})( +)([^\r\n]+)(\r?\n|\Z)',
             bygroups(Keyword.Reserved, Operator, Number, Text, Number,
                      Text, Name.Exception, Text),
             'headers'),
        ],
        'headers': [
            (r'([^\s:]+)( *)(:)( *)([^\r\n]+)(\r?\n|\Z)', header_callback),
            (r'([\t ]+)([^\r\n]+)(\r?\n|\Z)', continuous_header_callback),
            (r'\r?\n', Text, 'content')
        ],
        'content': [
            (r'.+', content_callback)
        ]
    }

    def analyse_text(text):
        return text.startswith(('GET /', 'POST /', 'PUT /', 'DELETE /', 'HEAD /',
                                'OPTIONS /', 'TRACE /', 'PATCH /'))
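
# A minimal usage sketch (illustrative; the session text is invented):
#
#     from pygments import highlight
#     from pygments.formatters import TerminalFormatter
#     from pygments.lexers.textfmts import HttpLexer
#
#     session = ('GET /api/items HTTP/1.1\r\n'
#                'Content-Type: application/json\r\n'
#                '\r\n'
#                '{"id": 1}')
#     print(highlight(session, HttpLexer(), TerminalFormatter()))
#
# Because the Content-Type header is application/json, the body should be
# re-lexed by the JSON lexer found via get_lexer_for_mimetype.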


class TodotxtLexer(RegexLexer):
    """
    Lexer for `Todo.txt <http://todotxt.com/>`_ todo list format.

    .. versionadded:: 2.0
    """

    name = 'Todotxt'
    aliases = ['todotxt']
    # *.todotxt is not a standard extension for Todo.txt files; including it
    # makes testing easier, and also makes autodetecting file type easier.
    filenames = ['todo.txt', '*.todotxt']
    mimetypes = ['text/x-todo']

    # Aliases mapping standard token types to Todo.txt format concepts
    CompleteTaskText = Operator  # Chosen to de-emphasize complete tasks
    IncompleteTaskText = Text    # Incomplete tasks should look like plain text

    # Priority should have most emphasis to indicate importance of tasks
    Priority = Generic.Heading
    # Dates should have next most emphasis because time is important
    Date = Generic.Subheading

    # Project and context should have equal weight, and be in different colors
    Project = Generic.Error
    Context = String

    # If tag functionality is added, it should have the same weight as Project
    # and Context, and a different color. Generic.Traceback would work well.

    # Regex patterns for building up rules; dates, priorities, projects, and
    # contexts are all atomic
    # TODO: Make date regex more ISO 8601 compliant
    date_regex = r'\d{4,}-\d{2}-\d{2}'
    priority_regex = r'\([A-Z]\)'
    project_regex = r'\+\S+'
    context_regex = r'@\S+'

    # Compound regex expressions
    complete_one_date_regex = r'(x )(' + date_regex + r')'
    complete_two_date_regex = (complete_one_date_regex + r'( )(' +
                               date_regex + r')')
    priority_date_regex = r'(' + priority_regex + r')( )(' + date_regex + r')'
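
    # Illustrative matches for the compound expressions above:
    #   complete_one_date_regex:  'x 2017-01-02'
    #   complete_two_date_regex:  'x 2017-01-02 2017-01-01'
    #   priority_date_regex:      '(A) 2017-01-01'
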
    tokens = {
        # Should parse starting at beginning of line; each line is a task
        'root': [
            # Complete task entry points: two total:
            # 1. Complete task with two dates
            (complete_two_date_regex, bygroups(CompleteTaskText, Date,
                                               CompleteTaskText, Date),
             'complete'),
            # 2. Complete task with one date
            (complete_one_date_regex, bygroups(CompleteTaskText, Date),
             'complete'),

            # Incomplete task entry points: six total:
            # 1. Priority plus date
            (priority_date_regex, bygroups(Priority, IncompleteTaskText, Date),
             'incomplete'),
            # 2. Priority only
            (priority_regex, Priority, 'incomplete'),
            # 3. Leading date
            (date_regex, Date, 'incomplete'),
            # 4. Leading context
            (context_regex, Context, 'incomplete'),
            # 5. Leading project
            (project_regex, Project, 'incomplete'),
            # 6. Non-whitespace catch-all
            (r'\S+', IncompleteTaskText, 'incomplete'),
        ],

        # Parse a complete task
        'complete': [
            # Newline indicates end of task, should return to root
            (r'\s*\n', CompleteTaskText, '#pop'),
            # Tokenize contexts and projects
            (context_regex, Context),
            (project_regex, Project),
            # Tokenize non-whitespace text
            (r'\S+', CompleteTaskText),
            # Tokenize whitespace not containing a newline
            (r'\s+', CompleteTaskText),
        ],

        # Parse an incomplete task
        'incomplete': [
            # Newline indicates end of task, should return to root
            (r'\s*\n', IncompleteTaskText, '#pop'),
            # Tokenize contexts and projects
            (context_regex, Context),
            (project_regex, Project),
            # Tokenize non-whitespace text
            (r'\S+', IncompleteTaskText),
            # Tokenize whitespace not containing a newline
            (r'\s+', IncompleteTaskText),
        ],
    }
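
# A minimal usage sketch (illustrative; the task line is invented):
#
#     from pygments.lexers.textfmts import TodotxtLexer
#
#     task = '(A) 2017-01-01 Call Mom +Family @phone\n'
#     for index, token, value in TodotxtLexer().get_tokens_unprocessed(task):
#         print(index, token, repr(value))
#
# '(A)' should lex as Priority (Generic.Heading), the date as Date
# (Generic.Subheading), '+Family' as Project (Generic.Error), and
# '@phone' as Context (String).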