# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

"""Removes commented-out Python code."""

from __future__ import print_function
from __future__ import unicode_literals

import difflib
import io
import os
import re
import tokenize

__version__ = '2.0.0'


class Eradicator(object):
    """Eradicate comments."""
    BRACKET_REGEX = re.compile(r'^[()\[\]{}\s]+$')
    CODING_COMMENT_REGEX = re.compile(r'.*?coding[:=][ \t]*([-_.a-zA-Z0-9]+)')
    DEF_STATEMENT_REGEX = re.compile(r"def .+\)[\s]+->[\s]+[a-zA-Z_][a-zA-Z0-9_]*:$")
    FOR_STATEMENT_REGEX = re.compile(r"for [a-zA-Z_][a-zA-Z0-9_]* in .+:$")
    HASH_NUMBER = re.compile(r'#[0-9]')
    MULTILINE_ASSIGNMENT_REGEX = re.compile(r'^\s*\w+\s*=.*[(\[{]$')
    PARTIAL_DICTIONARY_REGEX = re.compile(r'^\s*[\'"]\w+[\'"]\s*:.+[,{]\s*$')
    PRINT_RETURN_REGEX = re.compile(r'^(print|return)\b\s*')
    WITH_STATEMENT_REGEX = re.compile(r"with .+ as [a-zA-Z_][a-zA-Z0-9_]*:$")

    CODE_INDICATORS = ['(', ')', '[', ']', '{', '}', ':', '=', '%',
                       'print', 'return', 'break', 'continue', 'import']
    CODE_KEYWORDS = [r'elif\s+.*', 'else', 'try', 'finally', r'except\s+.*']
    CODE_KEYWORDS_AGGR = CODE_KEYWORDS + [r'if\s+.*']
    WHITESPACE_HASH = ' \t\v\n#'

    DEFAULT_WHITELIST = (
        r'pylint',
        r'pyright',
        r'noqa',
        r'type:\s*ignore',
        r'fmt:\s*(on|off)',
        r'TODO',
        r'FIXME',
        r'XXX'
    )
    WHITELIST_REGEX = re.compile(r'|'.join(DEFAULT_WHITELIST), flags=re.IGNORECASE)
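    # Comments matching any of these patterns (case-insensitive regex
    # search) are always preserved; update_whitelist() below lets callers
    # extend or replace this set at runtime.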

    def comment_contains_code(self, line, aggressive=True):
        """Return True if comment contains code."""
        line = line.lstrip()
        if not line.startswith('#'):
            return False

        line = line.lstrip(self.WHITESPACE_HASH).strip()

        # Ignore non-comment related hashes. For example, "# Issue #999".
        if self.HASH_NUMBER.search(line):
            return False

        # Ignore whitelisted comments.
        if self.WHITELIST_REGEX.search(line):
            return False

        if self.CODING_COMMENT_REGEX.match(line):
            return False

        # Check that this is possibly code.
        for symbol in self.CODE_INDICATORS:
            if symbol in line:
                break
        else:
            return False
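        # (The "else" above binds to the "for" loop: it runs only when no
        # code indicator was found anywhere in the comment text.)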

        if self.multiline_case(line, aggressive=aggressive):
            return True

        for symbol in self.CODE_KEYWORDS_AGGR if aggressive else self.CODE_KEYWORDS:
            if re.match(r'^\s*' + symbol + r'\s*:\s*$', line):
                return True

        line = self.PRINT_RETURN_REGEX.sub('', line)

        if self.PARTIAL_DICTIONARY_REGEX.match(line):
            return True

        try:
            compile(line, '<string>', 'exec')
        except (SyntaxError, TypeError, UnicodeDecodeError):
            return False
        else:
            return True
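    # A comment holding only a fragment of a multiline statement (e.g.
    # "# x = [" or "# ])") will not compile() on its own, so
    # multiline_case() applies separate pattern checks for those shapes.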

    def multiline_case(self, line, aggressive=True):
        """Return True if line is probably part of some multiline code."""
        if aggressive:
            for ending in ')]}':
                if line.endswith(ending + ':'):
                    return True

                if line.strip() == ending + ',':
                    return True

            # Check whether this is a function/method definition with a
            # return value annotation.
            if self.DEF_STATEMENT_REGEX.search(line):
                return True

            # Check whether this is a with statement.
            if self.WITH_STATEMENT_REGEX.search(line):
                return True

            # Check whether this is a for statement.
            if self.FOR_STATEMENT_REGEX.search(line):
                return True

        if line.endswith('\\'):
            return True

        if self.MULTILINE_ASSIGNMENT_REGEX.match(line):
            return True

        if self.BRACKET_REGEX.match(line):
            return True

        return False
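    # tokenize is used below so that only genuine comment tokens are
    # inspected; "#" characters inside string literals never match.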

    def commented_out_code_line_numbers(self, source, aggressive=True):
        """Yield line numbers of commented-out code."""
        sio = io.StringIO(source)
        try:
            for token in tokenize.generate_tokens(sio.readline):
                token_type = token[0]
                start_row = token[2][0]
                line = token[4]

                if (token_type == tokenize.COMMENT and
                        line.lstrip().startswith('#') and
                        self.comment_contains_code(line, aggressive)):
                    yield start_row
        except (tokenize.TokenError, IndentationError):
            pass
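        # Source that cannot be tokenized at all simply yields no line
        # numbers instead of raising to the caller.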

    def filter_commented_out_code(self, source, aggressive=True):
        """Yield code with commented out code removed."""
        marked_lines = list(self.commented_out_code_line_numbers(source,
                                                                 aggressive))
        sio = io.StringIO(source)
        previous_line = ''
        for line_number, line in enumerate(sio.readlines(), start=1):
            if (line_number not in marked_lines or
                    previous_line.rstrip().endswith('\\')):
                yield line
            previous_line = line
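        # A marked line is still yielded when the previous line ends with a
        # backslash, so explicit line continuations are never broken apart.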

    def fix_file(self, filename, args, standard_out):
        """Run filter_commented_out_code() on file."""
        encoding = self.detect_encoding(filename)
        with self.open_with_encoding(filename, encoding=encoding) as input_file:
            source = input_file.read()

        filtered_source = ''.join(self.filter_commented_out_code(source,
                                                                 args.aggressive))

        if source != filtered_source:
            if args.in_place:
                with self.open_with_encoding(filename, mode='w',
                                             encoding=encoding) as output_file:
                    output_file.write(filtered_source)
            else:
                diff = difflib.unified_diff(
                    source.splitlines(),
                    filtered_source.splitlines(),
                    'before/' + filename,
                    'after/' + filename,
                    lineterm='')
                standard_out.write('\n'.join(list(diff) + ['']))
            return True
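        # True is returned only when the file actually changed; main() can
        # presumably use this for the --error exit-code behaviour.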

    def open_with_encoding(self, filename, encoding, mode='r'):
        """Return opened file with a specific encoding."""
        return io.open(filename, mode=mode, encoding=encoding,
                       newline='')  # Preserve line endings

    def detect_encoding(self, filename):
        """Return file encoding."""
        try:
            with open(filename, 'rb') as input_file:
                from lib2to3.pgen2 import tokenize as lib2to3_tokenize
                encoding = lib2to3_tokenize.detect_encoding(input_file.readline)[0]

            # Check for correctness of encoding.
            with self.open_with_encoding(filename, encoding) as input_file:
                input_file.read()

            return encoding
        except (SyntaxError, LookupError, UnicodeDecodeError):
            return 'latin-1'
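        # latin-1 maps every possible byte to a character, so the fallback
        # read can never raise UnicodeDecodeError itself.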

    def update_whitelist(self, new_whitelist, extend_default=True):
        """Update the whitelist."""
        if extend_default:
            self.WHITELIST_REGEX = re.compile(
                r'|'.join(list(self.DEFAULT_WHITELIST) + new_whitelist),
                flags=re.IGNORECASE)
        else:
            self.WHITELIST_REGEX = re.compile(
                r'|'.join(new_whitelist),
                flags=re.IGNORECASE)
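
# Minimal usage sketch (assumes this file is importable as `eradicate` and
# that `source` holds the text of a Python file):
#
#     from eradicate import Eradicator
#
#     eradicator = Eradicator()
#     cleaned = ''.join(eradicator.filter_commented_out_code(source))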

def main(argv, standard_out, standard_error):
    """Main entry point."""
    import argparse
    parser = argparse.ArgumentParser(description=__doc__, prog='eradicate')
    parser.add_argument('-i', '--in-place', action='store_true',
                        help='make changes to files instead of printing diffs')
    parser.add_argument('-r', '--recursive', action='store_true',
                        help='drill down directories recursively')
    parser.add_argument('-a', '--aggressive', action='store_true',
                        help='make more aggressive changes; '
                             'this may result in false positives')
    parser.add_argument('-e', '--error', action="store_true",
                        help="Exit code based on result of check")
    parser.add_argument('--version', action='version',
                        version='%(prog)s ' + __version__)
    parser.add_argument('--whitelist', action="store",
                        help=(
                            'String of "#" separated comment beginnings to whitelist. '
                            'Single parts are interpreted as regex. '
                            'OVERWRITING the default whitelist: {}'
                        ).format(Eradicator.DEFAULT_WHITELIST))
    parser.add_argument('--whitelist-extend', action="store",
                        help=(
                            'String of "#" separated comment beginnings to whitelist. '
                            'Single parts are interpreted as regex. '
                            'Overwrites --whitelist. '
                            'EXTENDING the default whitelist: {}'
                        ).format(Eradicator.DEFAULT_WHITELIST))
    parser.add_argument('files', nargs='+', help='files to format')

    args = parser.parse_args(argv[1:])

    eradicator = Eradicator()

    if args.whitelist_extend:
        eradicator.update_whitelist(args.whitelist_extend.split('#'), True)
    elif args.whitelist:
        eradicator.update_whitelist(args.whitelist.split('#'), False)

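    # For example, a (hypothetical) invocation with
    # --whitelist-extend 'nosec#wip' adds the regexes "nosec" and "wip" on
    # top of the default whitelist.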
    filenames = list(set(args.files))
    change_or_error = False
    while filenames:
        name = filenames.pop(0)
        if args.recursive and os.path.isdir(name):
            for root, directories, children in os.walk('{}'.format(name)):
                filenames += [os.path.join(root, f) for f in children