|
1 # Copyright (C) 2012-2018 Steven Myint |
|
2 # |
|
3 # Permission is hereby granted, free of charge, to any person obtaining |
|
4 # a copy of this software and associated documentation files (the |
|
5 # "Software"), to deal in the Software without restriction, including |
|
6 # without limitation the rights to use, copy, modify, merge, publish, |
|
7 # distribute, sublicense, and/or sell copies of the Software, and to |
|
8 # permit persons to whom the Software is furnished to do so, subject to |
|
9 # the following conditions: |
|
10 # |
|
11 # The above copyright notice and this permission notice shall be included |
|
12 # in all copies or substantial portions of the Software. |
|
13 # |
|
14 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, |
|
15 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF |
|
16 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. |
|
17 # IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY |
|
18 # CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, |
|
19 # TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE |
|
20 # SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. |
|
21 |
|
22 """Removes commented-out Python code.""" |
|
23 |
|
24 from __future__ import print_function |
|
25 from __future__ import unicode_literals |
|
26 |
|
27 import difflib |
|
28 import io |
|
29 import os |
|
30 import re |
|
31 import tokenize |
|
32 |
|
# Single-source package version (also reported by the --version CLI flag).
__version__ = '2.0.0'
|
34 |
|
35 |
|
class Eradicator(object):
    """Eradicate commented-out code.

    Detection is heuristic: a comment is considered code when it looks
    like a Python statement (``comment_contains_code``), unless it is
    whitelisted or matches one of the "definitely not code" patterns
    below.  All configuration lives in class attributes; instances only
    override ``WHITELIST_REGEX`` via ``update_whitelist``.
    """

    # Line made up solely of brackets/whitespace, e.g. the "# }" tail of
    # a commented-out multiline literal.
    BRACKET_REGEX = re.compile(r'^[()\[\]{}\s]+$')
    # PEP 263 source-encoding declaration ("# -*- coding: utf-8 -*-").
    CODING_COMMENT_REGEX = re.compile(r'.*?coding[:=][ \t]*([-_.a-zA-Z0-9]+)')
    # "def f(...) -> Type:" -- definition with a return annotation.
    DEF_STATEMENT_REGEX = re.compile(r"def .+\)[\s]+->[\s]+[a-zA-Z_][a-zA-Z0-9_]*:$")
    # "for name in ...:" loop header.
    FOR_STATEMENT_REGEX = re.compile(r"for [a-zA-Z_][a-zA-Z0-9_]* in .+:$")
    # "#123" -- issue/ticket references such as "# Issue #999".
    HASH_NUMBER = re.compile(r'#[0-9]')
    # "name = (" / "[" / "{" -- start of a multiline assignment.
    MULTILINE_ASSIGNMENT_REGEX = re.compile(r'^\s*\w+\s*=.*[(\[{]$')
    # "'key': value," -- a line from inside a dict literal.
    PARTIAL_DICTIONARY_REGEX = re.compile(r'^\s*[\'"]\w+[\'"]\s*:.+[,{]\s*$')
    # Leading "print"/"return" keyword (stripped before compile checks,
    # since "print x" / "return x" alone are not py3 expressions).
    PRINT_RETURN_REGEX = re.compile(r'^(print|return)\b\s*')
    # "with ... as name:" statement header.
    WITH_STATEMENT_REGEX = re.compile(r"with .+ as [a-zA-Z_][a-zA-Z0-9_]*:$")

    # At least one of these tokens must appear before a comment is even
    # considered as possible code.
    CODE_INDICATORS = ['(', ')', '[', ']', '{', '}', ':', '=', '%',
                       'print', 'return', 'break', 'continue', 'import']
    # Keywords that, alone on a line with a trailing colon, identify code.
    CODE_KEYWORDS = [r'elif\s+.*', 'else', 'try', 'finally', r'except\s+.*']
    # Aggressive mode additionally treats bare "if ...:" lines as code.
    CODE_KEYWORDS_AGGR = CODE_KEYWORDS + [r'if\s+.*']
    # Characters stripped from the front of a comment before analysis.
    WHITESPACE_HASH = ' \t\v\n#'

    # Comment markers that are never removed (matched case-insensitively
    # anywhere in the comment text).
    DEFAULT_WHITELIST = (
        r'pylint',
        r'pyright',
        r'noqa',
        r'type:\s*ignore',
        r'fmt:\s*(on|off)',
        r'TODO',
        r'FIXME',
        r'XXX'
    )
    WHITELIST_REGEX = re.compile(r'|'.join(DEFAULT_WHITELIST), flags=re.IGNORECASE)

    def comment_contains_code(self, line, aggressive=True):
        """Return True if the comment contains code.

        line -- a single source line, expected to start (after optional
            leading whitespace) with "#"; otherwise False is returned.
        aggressive -- when True, also apply the heuristics that may
            produce false positives (bare "if ...:", multiline cases).
        """
        line = line.lstrip()
        if not line.startswith('#'):
            return False

        line = line.lstrip(self.WHITESPACE_HASH).strip()

        # Ignore non-comment related hashes. For example, "# Issue #999".
        if self.HASH_NUMBER.search(line):
            return False

        # Ignore whitelisted comments.
        if self.WHITELIST_REGEX.search(line):
            return False

        # Ignore PEP 263 encoding declarations.
        if self.CODING_COMMENT_REGEX.match(line):
            return False

        # Check that this is possibly code.
        for symbol in self.CODE_INDICATORS:
            if symbol in line:
                break
        else:
            return False

        if self.multiline_case(line, aggressive=aggressive):
            return True

        # Bare keyword lines like "else:" or "except ValueError:".
        for symbol in self.CODE_KEYWORDS_AGGR if aggressive else self.CODE_KEYWORDS:
            if re.match(r'^\s*' + symbol + r'\s*:\s*$', line):
                return True

        # Strip statement keywords so the remainder can be compiled as an
        # expression ("print" is a statement in Python 2).
        line = self.PRINT_RETURN_REGEX.sub('', line)

        if self.PARTIAL_DICTIONARY_REGEX.match(line):
            return True

        # Final arbiter: does the remaining text compile as Python?
        try:
            compile(line, '<string>', 'exec')
        except (SyntaxError, TypeError, UnicodeDecodeError):
            return False
        else:
            return True


    def multiline_case(self, line, aggressive=True):
        """Return True if line is probably part of some multiline code."""
        if aggressive:
            for ending in ')]}':
                # "...):", "...]:" -- closing bracket of a multiline header.
                if line.endswith(ending + ':'):
                    return True

                # ")," alone on a line -- end of a nested literal.
                if line.strip() == ending + ',':
                    return True

            # Check whether a function/method definition with return value
            # annotation
            if self.DEF_STATEMENT_REGEX.search(line):
                return True

            # Check whether a with statement
            if self.WITH_STATEMENT_REGEX.search(line):
                return True

            # Check whether a for statement
            if self.FOR_STATEMENT_REGEX.search(line):
                return True

        # Explicit line continuation.
        if line.endswith('\\'):
            return True

        if self.MULTILINE_ASSIGNMENT_REGEX.match(line):
            return True

        if self.BRACKET_REGEX.match(line):
            return True

        return False


    def commented_out_code_line_numbers(self, source, aggressive=True):
        """Yield line numbers of commented-out code.

        Numbers are 1-based, as reported by tokenize.
        """
        sio = io.StringIO(source)
        try:
            for token in tokenize.generate_tokens(sio.readline):
                token_type = token[0]
                start_row = token[2][0]
                line = token[4]

                # Only whole-line comments qualify; trailing comments on
                # code lines are kept.
                if (token_type == tokenize.COMMENT and
                        line.lstrip().startswith('#') and
                        self.comment_contains_code(line, aggressive)):
                    yield start_row
        except (tokenize.TokenError, IndentationError):
            # Untokenizable source: report nothing rather than guess.
            pass


    def filter_commented_out_code(self, source, aggressive=True):
        """Yield code with commented out code removed.

        Yields every line of *source* (with its line ending) except
        those flagged by commented_out_code_line_numbers().
        """
        marked_lines = list(self.commented_out_code_line_numbers(source,
                                                                 aggressive))
        sio = io.StringIO(source)
        previous_line = ''
        for line_number, line in enumerate(sio.readlines(), start=1):
            # Keep a marked line when the previous line ends in a
            # backslash: deleting it would break the continuation.
            if (line_number not in marked_lines or
                    previous_line.rstrip().endswith('\\')):
                yield line
            previous_line = line


    def fix_file(self, filename, args, standard_out):
        """Run filter_commented_out_code() on file.

        With args.in_place, rewrite the file; otherwise write a unified
        diff to standard_out.  Returns True when any change was made
        (and None otherwise).
        """
        encoding = self.detect_encoding(filename)
        with self.open_with_encoding(filename, encoding=encoding) as input_file:
            source = input_file.read()

        filtered_source = ''.join(self.filter_commented_out_code(source,
                                                                 args.aggressive))

        if source != filtered_source:
            if args.in_place:
                with self.open_with_encoding(filename, mode='w',
                                             encoding=encoding) as output_file:
                    output_file.write(filtered_source)
            else:
                diff = difflib.unified_diff(
                    source.splitlines(),
                    filtered_source.splitlines(),
                    'before/' + filename,
                    'after/' + filename,
                    lineterm='')
                standard_out.write('\n'.join(list(diff) + ['']))
            return True


    def open_with_encoding(self, filename, encoding, mode='r'):
        """Return opened file with a specific encoding."""
        return io.open(filename, mode=mode, encoding=encoding,
                       newline='')  # Preserve line endings


    def detect_encoding(self, filename):
        """Return file encoding.

        Falls back to 'latin-1' (which accepts any byte sequence) when
        the declared encoding is missing, unknown, or wrong.
        """
        try:
            with open(filename, 'rb') as input_file:
                # NOTE(review): lib2to3 is deprecated upstream; consider
                # tokenize.detect_encoding() -- confirm equivalence first.
                from lib2to3.pgen2 import tokenize as lib2to3_tokenize
                encoding = lib2to3_tokenize.detect_encoding(input_file.readline)[0]

            # Check for correctness of encoding.
            with self.open_with_encoding(filename, encoding) as input_file:
                input_file.read()

            return encoding
        except (SyntaxError, LookupError, UnicodeDecodeError):
            return 'latin-1'

    def update_whitelist(self, new_whitelist, extend_default=True):
        """Update the whitelist.

        new_whitelist -- list of regex fragments to whitelist.
        extend_default -- when True, append to DEFAULT_WHITELIST;
            otherwise replace it entirely.
        """
        if extend_default:
            self.WHITELIST_REGEX = re.compile(
                r'|'.join(list(self.DEFAULT_WHITELIST) + new_whitelist),
                flags=re.IGNORECASE)
        else:
            self.WHITELIST_REGEX = re.compile(
                r'|'.join(new_whitelist),
                flags=re.IGNORECASE)
|
233 |
|
234 |
|
def main(argv, standard_out, standard_error):
    """Main entry point.

    Parses *argv* (argv[0] is the program name), eradicates each named
    file, and returns 1 when --error is set and a change or I/O error
    occurred (None otherwise, i.e. exit status 0).
    """
    import argparse
    parser = argparse.ArgumentParser(description=__doc__, prog='eradicate')
    parser.add_argument('-i', '--in-place', action='store_true',
                        help='make changes to files instead of printing diffs')
    parser.add_argument('-r', '--recursive', action='store_true',
                        help='drill down directories recursively')
    parser.add_argument('-a', '--aggressive', action='store_true',
                        help='make more aggressive changes; '
                             'this may result in false positives')
    parser.add_argument('-e', '--error', action="store_true",
                        help="Exit code based on result of check")
    parser.add_argument('--version', action='version',
                        version='%(prog)s ' + __version__)
    parser.add_argument('--whitelist', action="store",
                        help=(
                            'String of "#" separated comment beginnings to whitelist. '
                            'Single parts are interpreted as regex. '
                            'OVERWRITING the default whitelist: {}'
                        ).format(Eradicator.DEFAULT_WHITELIST))
    parser.add_argument('--whitelist-extend', action="store",
                        help=(
                            'String of "#" separated comment beginnings to whitelist '
                            'Single parts are interpreted as regex. '
                            'Overwrites --whitelist. '
                            'EXTENDING the default whitelist: {} '
                        ).format(Eradicator.DEFAULT_WHITELIST))
    parser.add_argument('files', nargs='+', help='files to format')

    args = parser.parse_args(argv[1:])

    eradicator = Eradicator()

    # --whitelist-extend wins over --whitelist when both are given.
    if args.whitelist_extend:
        eradicator.update_whitelist(args.whitelist_extend.split('#'), True)
    elif args.whitelist:
        eradicator.update_whitelist(args.whitelist.split('#'), False)

    # Worklist of paths still to process; directories found during a
    # recursive walk are expanded into it.
    pending = list(set(args.files))
    found_change_or_error = False
    while pending:
        current = pending.pop(0)
        if args.recursive and os.path.isdir(current):
            for root, directories, children in os.walk(current):
                pending.extend(os.path.join(root, child)
                               for child in children
                               if child.endswith('.py') and
                               not child.startswith('.'))
                # Prune hidden directories from the walk.
                directories[:] = [d for d in directories
                                  if not d.startswith('.')]
        else:
            try:
                changed = eradicator.fix_file(current, args=args, standard_out=standard_out)
            except IOError as exception:
                print('{}'.format(exception), file=standard_error)
                found_change_or_error = True
            else:
                found_change_or_error = bool(changed) or found_change_or_error
    if found_change_or_error and args.error:
        return 1