Joshua Lochner committed on
Commit
4f1ee08
1 Parent(s): 13df84e

Add tokens for each category

Browse files
Files changed (1) hide show
  1. src/shared.py +19 -4
src/shared.py CHANGED
@@ -7,18 +7,19 @@ from typing import Optional
7
  from dataclasses import dataclass, field
8
  from enum import Enum
9
 
 
 
 
 
10
  class CustomTokens(Enum):
11
  EXTRACT_SEGMENTS_PREFIX = 'EXTRACT_SEGMENTS: '
12
 
 
13
  URL = 'URL_TOKEN'
14
  HYPHENATED_URL = 'HYPHENATED_URL_TOKEN'
15
  NUMBER_PERCENTAGE = 'NUMBER_PERCENTAGE_TOKEN'
16
  NUMBER = 'NUMBER_TOKEN'
17
 
18
- START_SEGMENT = 'START_SEGMENT_TOKEN'
19
- END_SEGMENT = 'END_SEGMENT_TOKEN'
20
- NO_SEGMENT = 'NO_SEGMENT_FOUND'
21
-
22
  SHORT_HYPHENATED = 'SHORT_HYPHENATED_TOKEN'
23
  LONG_WORD = 'LONG_WORD_TOKEN'
24
 
@@ -28,6 +29,20 @@ class CustomTokens(Enum):
28
  LAUGHTER = '[Laughter]'
29
 
30
  PROFANITY = 'PROFANITY_TOKEN'
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
 
32
  @classmethod
33
  def custom_tokens(cls):
 
7
  from dataclasses import dataclass, field
8
  from enum import Enum
9
 
10
+
11
+ START_SEGMENT_TEMPLATE = 'START_{}_TOKEN'
12
+ END_SEGMENT_TEMPLATE = 'END_{}_TOKEN'
13
+
14
  class CustomTokens(Enum):
15
  EXTRACT_SEGMENTS_PREFIX = 'EXTRACT_SEGMENTS: '
16
 
17
+ # Preprocessing tokens
18
  URL = 'URL_TOKEN'
19
  HYPHENATED_URL = 'HYPHENATED_URL_TOKEN'
20
  NUMBER_PERCENTAGE = 'NUMBER_PERCENTAGE_TOKEN'
21
  NUMBER = 'NUMBER_TOKEN'
22
 
 
 
 
 
23
  SHORT_HYPHENATED = 'SHORT_HYPHENATED_TOKEN'
24
  LONG_WORD = 'LONG_WORD_TOKEN'
25
 
 
29
  LAUGHTER = '[Laughter]'
30
 
31
  PROFANITY = 'PROFANITY_TOKEN'
32
+
33
+ # Segment tokens
34
+ NO_SEGMENT = 'NO_SEGMENT_TOKEN'
35
+
36
+ START_SPONSOR = START_SEGMENT_TEMPLATE.format('SPONSOR')
37
+ END_SPONSOR = END_SEGMENT_TEMPLATE.format('SPONSOR')
38
+
39
+ START_SELFPROMO = START_SEGMENT_TEMPLATE.format('SELFPROMO')
40
+ END_SELFPROMO = END_SEGMENT_TEMPLATE.format('SELFPROMO')
41
+
42
+ START_INTERACTION = START_SEGMENT_TEMPLATE.format('INTERACTION')
43
+ END_INTERACTION = END_SEGMENT_TEMPLATE.format('INTERACTION')
44
+
45
+ BETWEEN_SEGMENTS = 'BETWEEN_SEGMENTS_TOKEN'
46
 
47
  @classmethod
48
  def custom_tokens(cls):